MLIR 22.0.0git
GPUTransformOps.cpp
1//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
10
33#include "mlir/IR/AffineExpr.h"
34#include "mlir/IR/Builders.h"
36#include "mlir/IR/IRMapping.h"
37#include "mlir/IR/MLIRContext.h"
39#include "mlir/IR/Visitors.h"
40#include "mlir/Support/LLVM.h"
42#include "llvm/ADT/STLExtras.h"
43#include "llvm/ADT/SmallVector.h"
44#include "llvm/ADT/TypeSwitch.h"
45#include "llvm/Support/DebugLog.h"
46#include "llvm/Support/ErrorHandling.h"
47#include "llvm/Support/InterleavedRange.h"
48#include "llvm/Support/LogicalResult.h"
49#include <optional>
50#include <type_traits>
51
52using namespace mlir;
53using namespace mlir::gpu;
54using namespace mlir::transform;
55using namespace mlir::transform::gpu;
56
57#define DEBUG_TYPE "gpu-transforms"
58
59//===----------------------------------------------------------------------===//
60// Apply...ConversionPatternsOp
61//===----------------------------------------------------------------------===//
62
63void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
64 TypeConverter &typeConverter, RewritePatternSet &patterns) {
65 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
66 populateCommonGPUTypeAndAttributeConversions(llvmTypeConverter);
67 // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now.
68 // TODO: We should have a single to_nvvm_type_converter.
69 llvmTypeConverter.addConversion(
70 [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
71 // Set higher benefit, so patterns will run before generic LLVM lowering.
72 populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns,
73 getBenefit());
74}
75
76LogicalResult
77transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
78 transform::TypeConverterBuilderOpInterface builder) {
79 if (builder.getTypeConverterType() != "LLVMTypeConverter")
80 return emitOpError("expected LLVMTypeConverter");
81 return success();
82}
83
84void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
85 TypeConverter &typeConverter, RewritePatternSet &patterns) {
86 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
87 populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
88}
89
90LogicalResult
91transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
92 transform::TypeConverterBuilderOpInterface builder) {
93 if (builder.getTypeConverterType() != "LLVMTypeConverter")
94 return emitOpError("expected LLVMTypeConverter");
95 return success();
96}
97
98void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
99 populatePatterns(TypeConverter &typeConverter,
100 RewritePatternSet &patterns) {
101 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
102 populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
103}
104
105LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
106 verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
107 if (builder.getTypeConverterType() != "LLVMTypeConverter")
108 return emitOpError("expected LLVMTypeConverter");
109 return success();
110}
111
112void transform::ApplyGPUToROCDLConversionPatternsOp::populatePatterns(
113 TypeConverter &typeConverter, RewritePatternSet &patterns) {
114 auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
115 populateCommonGPUTypeAndAttributeConversions(llvmTypeConverter);
116 FailureOr<amdgpu::Chipset> maybeChipset =
117 amdgpu::Chipset::parse(getChipset());
118 assert(llvm::succeeded(maybeChipset) && "expected valid chipset");
119 populateGpuToROCDLConversionPatterns(
120 llvmTypeConverter, patterns, mlir::gpu::amd::Runtime::HIP, *maybeChipset);
121}
122
123LogicalResult
124transform::ApplyGPUToROCDLConversionPatternsOp::verifyTypeConverter(
125 transform::TypeConverterBuilderOpInterface builder) {
126 FailureOr<amdgpu::Chipset> maybeChipset =
127 amdgpu::Chipset::parse(getChipset());
128 if (failed(maybeChipset)) {
129 return emitOpError("Invalid chipset name: " + getChipset());
130 }
131 if (builder.getTypeConverterType() != "LLVMTypeConverter")
132 return emitOpError("expected LLVMTypeConverter");
133 return success();
134}
135
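Both ROCDL ops above hinge on amdgpu::Chipset::parse accepting the supplied chipset name. A minimal standalone sketch of that check follows; the Chipset header path and the helper name isParsableChipset are assumptions made for illustration only, and "gfx90a" is just an example string.

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "llvm/ADT/StringRef.h"

// Returns true if `name` parses as a chipset string such as "gfx90a";
// Chipset::parse returns failure for names it cannot recognize.
static bool isParsableChipset(llvm::StringRef name) {
  mlir::FailureOr<mlir::amdgpu::Chipset> maybeChipset =
      mlir::amdgpu::Chipset::parse(name);
  return llvm::succeeded(maybeChipset);
}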
136//===----------------------------------------------------------------------===//
137// Apply...PatternsOp
138//===----------------------------------------------------------------------===//
139
140void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
141 populateGpuRewritePatterns(patterns);
142}
143
144void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
145 RewritePatternSet &patterns) {
146 std::optional<StringRef> chipsetName = getChipset();
147 std::optional<amdgpu::Chipset> maybeChipset;
148 if (chipsetName) {
149 FailureOr<amdgpu::Chipset> parsedChipset =
150 amdgpu::Chipset::parse(*chipsetName);
151 assert(llvm::succeeded(parsedChipset) && "expected valid chipset");
152 maybeChipset = parsedChipset;
153 }
154
155 populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
156}
157
158//===----------------------------------------------------------------------===//
159// ApplyUnrollVectorsSubgroupMmaOp
160//===----------------------------------------------------------------------===//
161
162/// Pick an unrolling order that will allow the tensorcore operation to reuse
163/// the LHS register.
164static std::optional<SmallVector<int64_t>>
165gpuMmaUnrollOrder(vector::ContractionOp contract) {
166 SmallVector<int64_t> order;
167 // First make reduction the outer dimensions.
168 for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
169 if (vector::isReductionIterator(iter)) {
170 order.push_back(index);
171 }
172 }
173
174 llvm::SmallDenseSet<int64_t> dims;
175 for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
176 dims.insert(cast<AffineDimExpr>(expr).getPosition());
177 }
178 // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
179 for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
180 if (vector::isParallelIterator(iter) && dims.count(index)) {
181 order.push_back(index);
182 }
183 }
184 // Then the remaining parallel loops.
185 for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
186 if (vector::isParallelIterator(iter) && !dims.count(index)) {
187 order.push_back(index);
188 }
189 }
190 return order;
191}
192
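As a worked example (iterator layout assumed for illustration): for a plain matmul vector.contract with iterator types [parallel, parallel, reduction] and an LHS indexing map over (d0, d2), the loops above first emit the reduction dimension 2, then the LHS-parallel dimension 0, and finally the remaining parallel dimension 1, giving the unroll order [2, 0, 1].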
193/// Returns the target vector size for the target operation based on the native
194/// vector size specified with `m`, `n`, and `k`.
195static std::optional<SmallVector<int64_t>>
196getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) {
197 if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
198 int64_t contractRank = contract.getIteratorTypes().size();
199 if (contractRank < 3)
200 return std::nullopt;
201 SmallVector<int64_t> nativeSize(contractRank - 3, 1);
202 nativeSize.append({m, n, k});
203 return nativeSize;
204 }
205 if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
206 int64_t writeRank = writeOp.getVectorType().getRank();
207 if (writeRank < 2)
208 return std::nullopt;
209 SmallVector<int64_t> nativeSize(writeRank - 2, 1);
210 nativeSize.append({m, n});
211 return nativeSize;
212 }
213 if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
214 // Transfer read ops may need different shapes based on how they are being
215 // used. For simplicity just match the shape used by the extract strided op.
216 VectorType sliceType;
217 for (Operation *users : op->getUsers()) {
218 auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
219 if (!extract)
220 return std::nullopt;
221 auto vecType = cast<VectorType>(extract.getResult().getType());
222 if (sliceType && sliceType != vecType)
223 return std::nullopt;
224 sliceType = vecType;
225 }
226 return llvm::to_vector(sliceType.getShape());
227 }
228 if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
229 if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
230 // TODO: The condition for unrolling elementwise should be restricted
231 // only to operations that need unrolling (connected to the contract).
232 if (vecType.getRank() < 2)
233 return std::nullopt;
234
235 // First check whether there is a slice to infer the shape from. This is
236 // required for cases where the accumulator type differs from the input
237 // types, in which case we will see an `arith.ext_` between the contract
238 // and transfer_read which needs to be unrolled.
239 VectorType sliceType;
240 for (Operation *users : op->getUsers()) {
241 auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
242 if (!extract)
243 return std::nullopt;
244 auto vecType = cast<VectorType>(extract.getResult().getType());
245 if (sliceType && sliceType != vecType)
246 return std::nullopt;
247 sliceType = vecType;
248 }
249 if (sliceType)
250 return llvm::to_vector(sliceType.getShape());
251
252 // Else unroll for trailing elementwise.
253 SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
254 // Map elementwise ops to the output shape.
255 nativeSize.append({m, n});
256 return nativeSize;
257 }
258 }
259 return std::nullopt;
260}
261
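For example, with native sizes m = 16, n = 16, k = 8 (illustrative values): a rank-3 vector.contract unrolls to [16, 16, 8] and a batched rank-4 contract to [1, 16, 16, 8]; a rank-2 vector.transfer_write gets [16, 16]; a vector.transfer_read takes its shape from its vector.extract_strided_slice users; and a rank-2 trailing elementwise op is mapped to the [16, 16] output shape.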
262void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
263 RewritePatternSet &patterns) {
264 auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
265 auto contract = dyn_cast<vector::ContractionOp>(op);
266 if (!contract)
267 return std::nullopt;
268 return gpuMmaUnrollOrder(contract);
269 };
270
271 int64_t m = getM();
272 int64_t n = getN();
273 int64_t k = getK();
274 auto nativeShapeFn =
275 [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
276 return getSubgroupMmaNativeVectorSize(op, m, n, k);
277 };
278 vector::populateVectorUnrollPatterns(
279 patterns, vector::UnrollVectorOptions()
280 .setNativeShapeFn(nativeShapeFn)
281 .setUnrollTraversalOrderFn(unrollOrder));
282}
283
284//===----------------------------------------------------------------------===//
285// EliminateBarriersOp
286//===----------------------------------------------------------------------===//
287
288void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
289 populateGpuEliminateBarriersPatterns(patterns);
290}
291
292//===----------------------------------------------------------------------===//
293// Block and thread mapping utilities.
294//===----------------------------------------------------------------------===//
295
296namespace {
297/// Local types used for mapping verification.
298struct MappingKind {};
299struct BlockMappingKind : MappingKind {};
300struct ThreadMappingKind : MappingKind {};
301} // namespace
302
303static DiagnosedSilenceableFailure
304definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
305 Operation *target, const Twine &message) {
306 if (transformOp.has_value())
307 return transformOp->emitDefiniteFailure() << message;
308 return emitDefiniteFailure(target, message);
309}
310
311/// Check if the given mapping attributes are of one of the desired kinds.
312template <typename MappingKindType>
313static DiagnosedSilenceableFailure
314checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
315 scf::ForallOp forallOp) {
316 if (!forallOp.getMapping().has_value()) {
317 return definiteFailureHelper(transformOp, forallOp,
318 "scf.forall op requires a mapping attribute");
319 }
320
321 bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
322 llvm::IsaPred<GPUBlockMappingAttr>);
323 bool hasWarpgroupMapping = llvm::any_of(
324 forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
325 bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
326 llvm::IsaPred<GPUWarpMappingAttr>);
327 bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
328 llvm::IsaPred<GPUThreadMappingAttr>);
329 bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
330 llvm::IsaPred<GPULaneMappingAttr>);
331 int64_t countMappingTypes = 0;
332 countMappingTypes += hasBlockMapping ? 1 : 0;
333 countMappingTypes += hasWarpgroupMapping ? 1 : 0;
334 countMappingTypes += hasWarpMapping ? 1 : 0;
335 countMappingTypes += hasThreadMapping ? 1 : 0;
336 countMappingTypes += hasLaneMapping ? 1 : 0;
337 if (countMappingTypes > 1) {
338 return definiteFailureHelper(
339 transformOp, forallOp,
340 "cannot mix different mapping types, use nesting");
341 }
342 if (std::is_same<MappingKindType, BlockMappingKind>::value &&
343 !hasBlockMapping) {
344 return definiteFailureHelper(
345 transformOp, forallOp,
346 "scf.forall op requires a mapping attribute of kind 'block'");
347 }
348 if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
349 !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
350 !hasWarpgroupMapping) {
351 return definiteFailureHelper(transformOp, forallOp,
352 "scf.forall op requires a mapping attribute "
353 "of kind 'thread' or 'warp'");
354 }
355
356 DenseSet<Attribute> seen;
357 for (Attribute map : forallOp.getMapping()->getValue()) {
358 if (seen.contains(map)) {
359 return definiteFailureHelper(
360 transformOp, forallOp,
361 "duplicate attribute, cannot map different loops "
362 "to the same mapping id");
363 }
364 seen.insert(map);
365 }
366
367 auto isLinear = [](DeviceMappingAttrInterface attr) {
368 return attr.isLinearMapping();
369 };
370 if (llvm::any_of(forallOp.getDeviceMappingAttrs(), isLinear) &&
371 !llvm::all_of(forallOp.getDeviceMappingAttrs(), isLinear)) {
372 return definiteFailureHelper(
373 transformOp, forallOp,
374 "cannot mix linear and non-linear mapping modes");
375 }
376
377 FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
378 forallOp.getDeviceMaskingAttr();
379 if (succeeded(maybeMaskingAttr) && *maybeMaskingAttr &&
380 !forallOp.usesLinearMapping()) {
381 return definiteFailureHelper(
382 transformOp, forallOp,
383 "device masking is only available in linear mapping mode");
384 }
385
386 return DiagnosedSilenceableFailure::success();
387}
388
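To illustrate the checks above (attribute spellings assumed): [#gpu.block<x>, #gpu.block<y>] or [#gpu.thread<x>, #gpu.thread<y>] is accepted; [#gpu.block<x>, #gpu.thread<y>] is rejected with "cannot mix different mapping types"; reusing #gpu.thread<x> on two loops triggers the duplicate-attribute error; and combining #gpu.thread<linear_dim_0> with #gpu.thread<y> fails with "cannot mix linear and non-linear mapping modes".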
389template <typename MappingKindType>
390static DiagnosedSilenceableFailure
391verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
392 scf::ForallOp forallOp) {
393 // Check the types of the mapping attributes match.
394 DiagnosedSilenceableFailure typeRes =
395 checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
396 if (!typeRes.succeeded())
397 return typeRes;
398
399 // Perform other non-types verifications.
400 if (!forallOp.isNormalized())
401 return definiteFailureHelper(transformOp, forallOp,
402 "unsupported non-normalized loops");
403 if (forallOp.getNumResults() > 0)
404 return definiteFailureHelper(transformOp, forallOp,
405 "only bufferized scf.forall can be mapped");
406 bool useLinearMapping = forallOp.usesLinearMapping();
407 // TODO: This would be more natural with support for Optional<EnumParameter>
408 // in GPUDeviceMappingAttr.
409 int64_t maxNumMappingsSupported =
410 useLinearMapping ? (getMaxEnumValForMappingId() -
411 static_cast<uint64_t>(MappingId::DimZ))
412 : 3;
413 if (forallOp.getRank() > maxNumMappingsSupported) {
414 return definiteFailureHelper(transformOp, forallOp,
415 "scf.forall with rank > ")
416 << maxNumMappingsSupported
417 << " does not lower for the specified mapping attribute type";
418 }
419 auto numParallelIterations =
420 getConstantIntValues(forallOp.getMixedUpperBound());
421 if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
422 return definiteFailureHelper(
423 transformOp, forallOp,
424 "requires statically sized, normalized forall op");
425 }
426 return DiagnosedSilenceableFailure::success();
427}
428
429/// Struct to return the result of the rewrite of a forall operation.
430struct ForallRewriteResult {
431 SmallVector<int64_t> mappingSizes;
432 SmallVector<Value> mappingIds;
433};
434
435/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
436template <typename OpTy, typename OperationOrBlock>
437static void
438replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
439 OperationOrBlock *parent, Value replacement,
440 ArrayRef<int64_t> availableMappingSizes) {
441 parent->walk([&](OpTy idOp) {
442 if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
443 rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
444 });
445}
446
447static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
448 RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
449 scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
450 ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
451 LDBG() << "--start rewriteOneForallCommonImpl";
452
453 // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
454 auto numParallelIterations =
455 getConstantIntValues(forallOp.getMixedUpperBound());
456 assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
457 "requires statically sized, normalized forall op");
458 SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
459 SmallVector<DeviceMappingAttrInterface> forallMappingAttrsVec =
460 forallOp.getDeviceMappingAttrs();
461 SetVector<Attribute> forallMappingAttrs;
462 forallMappingAttrs.insert_range(forallMappingAttrsVec);
463 auto comparator = [](Attribute a, Attribute b) -> bool {
464 return cast<DeviceMappingAttrInterface>(a).getMappingId() <
465 cast<DeviceMappingAttrInterface>(b).getMappingId();
466 };
467
468 // Step 1.b. In the linear case, compute the max mapping to avoid needlessly
469 // mapping all dimensions. In the 3-D mapping case we need to map all
470 // dimensions.
471 DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
472 *llvm::max_element(forallMappingAttrs, comparator));
473 DeviceMappingAttrInterface maxLinearMapping;
474 if (maxMapping.isLinearMapping())
475 maxLinearMapping = maxMapping;
476 for (auto attr : gpuIdBuilder.mappingAttributes) {
477 // If attr overflows, just skip.
478 if (maxLinearMapping && comparator(maxLinearMapping, attr))
479 continue;
480 // Try to insert. If element was already present, just continue.
481 if (!forallMappingAttrs.insert(attr))
482 continue;
483 // Otherwise, we have a new insertion without a size -> use size 1.
484 tmpMappingSizes.push_back(1);
485 }
486 LDBG() << "----tmpMappingSizes extracted from scf.forall op: "
487 << llvm::interleaved(tmpMappingSizes);
488
489 // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
490 SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
491 forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
492 LDBG() << "----forallMappingSizes: " << llvm::interleaved(forallMappingSizes);
493 LDBG() << "----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs);
494
495 // Step 3. Generate the mappingIdOps using the provided generator.
496 Location loc = forallOp.getLoc();
497 OpBuilder::InsertionGuard guard(rewriter);
498 rewriter.setInsertionPoint(forallOp);
499 SmallVector<int64_t> originalBasis(availableMappingSizes);
500 bool originalBasisWasProvided = !originalBasis.empty();
501 if (!originalBasisWasProvided) {
502 LDBG() << "----originalBasis was not provided, deriving it and there will "
503 "be no "
504 "predication";
505 originalBasis = forallMappingSizes;
506 while (originalBasis.size() < 3)
507 originalBasis.push_back(1);
508 } else {
509 LDBG() << "----originalBasis was provided, using it, there will be "
510 "predication";
511 }
512 LDBG() << "------originalBasis: " << llvm::interleaved(originalBasis);
513
514 IdBuilderResult builderResult =
515 gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
516 if (!builderResult.errorMsg.empty())
517 return definiteFailureHelper(transformOp, forallOp, builderResult.errorMsg);
518
519 LDBG() << builderResult;
520
521 // Step 4. Map the induction variables to the mappingIdOps, this may involve
522 // a permutation.
523 SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
524 IRMapping bvm;
525 for (auto [iv, dim] : llvm::zip_equal(
526 forallOp.getInductionVars(),
527 forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
528 auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
529 Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
530 LDBG() << "----map: " << iv << " to " << peIdOp;
531 bvm.map(iv, peIdOp);
532 }
533
534 // Step 5. If the originalBasis is already known, create conditionals to
535 // predicate the region. Otherwise, the current forall determines the
536 // originalBasis and no predication occurs.
537 Value predicate;
538 if (originalBasisWasProvided) {
539 for (Value tmpPredicate : builderResult.predicateOps) {
540 predicate = predicate ? arith::AndIOp::create(rewriter, loc, predicate,
541 tmpPredicate)
542 : tmpPredicate;
543 }
544 }
545
546 // Step 6. Move the body of forallOp.
547 // Erase the terminator first, it will not be used.
548 rewriter.eraseOp(forallOp.getTerminator());
549 Block *targetBlock;
550 Block::iterator insertionPoint;
551 if (predicate) {
552 // Step 6.a. If predicated, move at the beginning.
553 auto ifOp = scf::IfOp::create(rewriter, loc, predicate,
554 /*withElseRegion=*/false);
555 targetBlock = ifOp.thenBlock();
556 insertionPoint = ifOp.thenBlock()->begin();
557 } else {
558 // Step 6.b. Otherwise, move inline just at the rewriter insertion
559 // point.
560 targetBlock = forallOp->getBlock();
561 insertionPoint = rewriter.getInsertionPoint();
562 }
563 Block &sourceBlock = forallOp.getRegion().front();
564 targetBlock->getOperations().splice(insertionPoint,
565 sourceBlock.getOperations());
566
567 // Step 7. RAUW indices.
568 for (Value loopIndex : forallOp.getInductionVars()) {
569 Value threadIdx = bvm.lookup(loopIndex);
570 rewriter.replaceAllUsesWith(loopIndex, threadIdx);
571 }
572
573 // Step 8. Erase old op.
574 rewriter.eraseOp(forallOp);
575
576 LDBG() << "----result forallMappingSizes: "
577 << llvm::interleaved(forallMappingSizes);
578 LDBG() << "----result mappingIdOps: " << llvm::interleaved(mappingIdOps);
579
580 result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
581 return DiagnosedSilenceableFailure::success();
582}
583
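A rough worked example of the steps above (sizes assumed for illustration): an scf.forall over (7, 9) whose loops map to [#gpu.thread<y>, #gpu.thread<x>] yields tmpMappingSizes [7, 9, 1]; sorting by mapping id gives forallMappingSizes [9, 7, 1]. The second induction variable is replaced by the x id and the first by the y id. If an originalBasis such as [12, 9, 1] was provided, the id builder also emits predicates (roughly x-id < 9 and y-id < 7) whose conjunction guards the moved body inside an scf.if; otherwise the body is spliced in unpredicated.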
584//===----------------------------------------------------------------------===//
585// MapForallToBlocks
586//===----------------------------------------------------------------------===//
587
588DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
589 RewriterBase &rewriter, TransformOpInterface transformOp,
590 scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
591 const GpuIdBuilder &gpuIdBuilder) {
592 LDBG() << "Start mapForallToBlocksImpl";
593
594 {
595 // GPU-specific verifications. There is no better place to anchor
596 // those right now: the ForallOp is target-independent and the transform
597 // op does not apply to individual ForallOp.
598 DiagnosedSilenceableFailure diag =
599 verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
600 if (!diag.succeeded())
601 return diag;
602 }
603
604 Location loc = forallOp.getLoc();
605 Block *parentBlock = forallOp->getBlock();
606 Value zero;
607 {
608 // Create an early zero index value for replacements and immediately reset
609 // the insertion point.
610 OpBuilder::InsertionGuard guard(rewriter);
611 rewriter.setInsertionPointToStart(parentBlock);
612 zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
613 }
614
615 ForallRewriteResult rewriteResult;
616 DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
617 rewriter, transformOp, forallOp,
618 /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);
619
620 // Return if anything goes wrong, use silenceable failure as a match
621 // failure.
622 if (!diag.succeeded())
623 return diag;
624
625 // If gridDims was not provided already, set it from the return.
626 if (gridDims.empty()) {
627 gridDims = rewriteResult.mappingSizes;
628 while (gridDims.size() < 3)
629 gridDims.push_back(1);
630 }
631 assert(gridDims.size() == 3 && "Need 3-D gridDims");
632
633 // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
634 // Here, the result of mapping determines the available mapping sizes.
635 replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
636 rewriteResult.mappingSizes);
637
638 return DiagnosedSilenceableFailure::success();
639}
640
641DiagnosedSilenceableFailure
642mlir::transform::gpu::findTopLevelForallOp(Operation *target,
643 scf::ForallOp &topLevelForallOp,
644 TransformOpInterface transformOp) {
645 auto walkResult = target->walk([&](scf::ForallOp forallOp) {
646 if (forallOp->getParentOfType<scf::ForallOp>())
647 return WalkResult::advance();
648 if (topLevelForallOp)
649 // TODO: Handle multiple forall if they are independent.
650 return WalkResult::interrupt();
651 topLevelForallOp = forallOp;
652 return WalkResult::advance();
653 });
654
655 if (walkResult.wasInterrupted() || !topLevelForallOp)
656 return transformOp.emitSilenceableError()
657 << "could not find a unique topLevel scf.forall";
658 return DiagnosedSilenceableFailure::success();
659}
660
661DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
662 transform::TransformRewriter &rewriter, Operation *target,
663 ApplyToEachResultList &results, TransformState &state) {
664 LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
665 auto transformOp = cast<TransformOpInterface>(getOperation());
666
667 if (!getGenerateGpuLaunch() && !gpuLaunch) {
668 DiagnosedSilenceableFailure diag =
669 emitSilenceableError()
670 << "Given target is not gpu.launch, set `generate_gpu_launch` "
671 "attribute";
672 diag.attachNote(target->getLoc()) << "when applied to this payload op";
673 return diag;
674 }
675
676 scf::ForallOp topLevelForallOp;
677 DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(
678 target, topLevelForallOp, transformOp);
679 if (!diag.succeeded()) {
680 diag.attachNote(target->getLoc()) << "when applied to this payload op";
681 return diag;
682 }
683 assert(topLevelForallOp && "expect an scf.forall");
684
685 SmallVector<int64_t> gridDims{getGridDims()};
686 if (!getGenerateGpuLaunch() && gridDims.size() != 3)
687 return transformOp.emitDefiniteFailure("transform requires size-3 mapping");
688
689 OpBuilder::InsertionGuard guard(rewriter);
690 rewriter.setInsertionPoint(topLevelForallOp);
691
692 // Generate gpu launch here and move the forall inside
693 if (getGenerateGpuLaunch()) {
694 DiagnosedSilenceableFailure diag =
695 createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
696 if (!diag.succeeded())
697 return diag;
698
699 rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
700 Operation *newForallOp = rewriter.clone(*topLevelForallOp);
701 rewriter.eraseOp(topLevelForallOp);
702 topLevelForallOp = cast<scf::ForallOp>(newForallOp);
703 }
704
705 // The BlockIdBuilder adapts to whatever is thrown at it.
706 bool useLinearMapping = false;
707 if (topLevelForallOp.getMapping())
708 useLinearMapping = topLevelForallOp.usesLinearMapping();
709
710 FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
711 topLevelForallOp.getDeviceMaskingAttr();
712 assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
713 assert((!*maybeMaskingAttr || useLinearMapping) &&
714 "masking requires linear mapping");
715
716 GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping,
717 *maybeMaskingAttr);
718
719 diag = mlir::transform::gpu::mapForallToBlocksImpl(
720 rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
721 if (!diag.succeeded())
722 return diag;
723
724 // Set the GPU launch configuration for the grid dims late, this is
725 // subject to IR inspection.
726 diag = alterGpuLaunch(rewriter, gpuLaunch,
727 cast<TransformOpInterface>(getOperation()), gridDims[0],
728 gridDims[1], gridDims[2]);
729
730 results.push_back(gpuLaunch);
731 return diag;
732}
733
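For instance (values assumed for illustration): a top-level scf.forall over (12, 34) mapped to [#gpu.block<x>, #gpu.block<y>], applied without an explicit grid_dims attribute, produces gridDims = [12, 34, 1], which alterGpuLaunch then writes into the surrounding or freshly generated gpu.launch configuration.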
734LogicalResult transform::MapForallToBlocks::verify() {
735 if (!getGridDims().empty() && getGridDims().size() != 3) {
736 return emitOpError() << "transform requires empty or size-3 grid_dims";
737 }
738 return success();
739}
740
741//===----------------------------------------------------------------------===//
742// MapNestedForallToThreads
743//===----------------------------------------------------------------------===//
744
745static DiagnosedSilenceableFailure checkMappingSpec(
746 std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
747 ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
748 int factor, bool useLinearMapping = false) {
749 if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
750 DiagnosedSilenceableFailure diag = definiteFailureHelper(
751 transformOp, forallOp,
752 Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
753 Twine(factor));
754 return diag;
755 }
756 if (computeProduct(numParallelIterations) * factor >
757 computeProduct(blockOrGridSizes)) {
758 DiagnosedSilenceableFailure diag = definiteFailureHelper(
759 transformOp, forallOp,
760 Twine("the number of required parallel resources (blocks or "
761 "threads) ") +
762 Twine(computeProduct(numParallelIterations) * factor) +
763 " overflows the number of available resources " +
764 Twine(computeProduct(blockOrGridSizes)));
765 return diag;
766 }
767 return DiagnosedSilenceableFailure::success();
768}
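A small numeric example of the resource check above (numbers assumed): mapping an scf.forall over (2, 4) onto warps with warpSize = 32 needs 2 * 4 * 32 = 256 threads, so block sizes [256, 1, 1] pass while [128, 1, 1] trigger the "overflows the number of available resources" diagnostic; in the non-linear 3-D case the x size must also be a multiple of the factor (32 here), so [192, 1, 1] passes the divisibility check but still fails the product check.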
769
770static DiagnosedSilenceableFailure
771getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
772 scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
773 int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
774 DeviceMappingAttrInterface mappingAttr =
775 forallOp.getDeviceMappingAttrs().front();
776 bool useLinearMapping = mappingAttr.isLinearMapping();
777
778 // Sanity checks that may result in runtime verification errors.
779 auto numParallelIterations =
780 getConstantIntValues((forallOp.getMixedUpperBound()));
781 if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
782 return definiteFailureHelper(
783 transformOp, forallOp,
784 "requires statically sized, normalized forall op");
785 }
786 int64_t factor = 1;
787 if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
788 factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
789 } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
790 factor = warpSize;
791 }
792 DiagnosedSilenceableFailure diag =
793 checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
794 blockSizes, factor, useLinearMapping);
795 if (!diag.succeeded())
796 return diag;
797
798 FailureOr<DeviceMaskingAttrInterface> maybeMaskingAttr =
799 forallOp.getDeviceMaskingAttr();
800 assert(succeeded(maybeMaskingAttr) && "unexpected failed maybeMaskingAttr");
801 assert((!*maybeMaskingAttr || useLinearMapping) &&
802 "masking requires linear mapping");
803
804 // Start mapping.
805 MLIRContext *ctx = forallOp.getContext();
806 gpuIdBuilder =
807 TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
808 .Case([&](GPUWarpgroupMappingAttr) {
809 return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping,
810 *maybeMaskingAttr);
811 })
812 .Case([&](GPUWarpMappingAttr) {
813 return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping,
814 *maybeMaskingAttr);
815 })
816 .Case([&](GPUThreadMappingAttr) {
817 return GpuThreadIdBuilder(ctx, useLinearMapping, *maybeMaskingAttr);
818 })
819 .Case([&](GPULaneMappingAttr) {
820 return GpuLaneIdBuilder(ctx, warpSize, useLinearMapping,
821 *maybeMaskingAttr);
822 })
823 .DefaultUnreachable("unknown mapping attribute");
824 return DiagnosedSilenceableFailure::success();
825}
826
827DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
828 RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
829 scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
830 bool syncAfterDistribute) {
831
832 {
833 // GPU-specific verifications. There is no better place to anchor
834 // those right now: the ForallOp is target-independent and the transform
835 // op does not apply to individual ForallOp.
836 DiagnosedSilenceableFailure diag =
837 verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
838 if (!diag.succeeded())
839 return diag;
840 }
841
842 GpuIdBuilder gpuIdBuilder;
843 {
844 // Try to construct the id builder, if it fails, return.
845 DiagnosedSilenceableFailure diag = getThreadIdBuilder(
846 transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
847 if (!diag.succeeded())
848 return diag;
849 }
850
851 Location loc = forallOp.getLoc();
852 OpBuilder::InsertionGuard g(rewriter);
853 // Insert after to allow for syncthreads after `forall` is erased.
854 rewriter.setInsertionPointAfter(forallOp);
855 ForallRewriteResult rewriteResult;
856 DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
857 rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
858 if (!diag.succeeded())
859 return diag;
860 // Add a syncthreads if needed. TODO: warpsync
861 if (syncAfterDistribute)
862 BarrierOp::create(rewriter, loc);
863
864 return DiagnosedSilenceableFailure::success();
865}
866
867DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
868 RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
869 Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
870 bool syncAfterDistribute) {
871 LDBG() << "Start mapNestedForallToThreadsImpl";
872 if (blockDims.size() != 3) {
873 return definiteFailureHelper(transformOp, target,
874 "requires size-3 thread mapping");
875 }
876
877 // Create an early zero index value for replacements.
878 Location loc = target->getLoc();
879 Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
880 DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
881 WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
882 diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
883 rewriter, transformOp, forallOp, blockDims, warpSize,
884 syncAfterDistribute);
885 if (diag.isDefiniteFailure())
886 return WalkResult::interrupt();
887 if (diag.succeeded())
888 return WalkResult::skip();
889 return WalkResult::advance();
890 });
891 if (walkResult.wasInterrupted())
892 return diag;
893
894 // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
895 // Here, the result of mapping determines the available mapping sizes.
896 replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
897 blockDims);
898
898
899 return DiagnosedSilenceableFailure::success();
900}
901
902DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
903 transform::TransformRewriter &rewriter, Operation *target,
904 ApplyToEachResultList &results, TransformState &state) {
905 LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
906 auto transformOp = cast<TransformOpInterface>(getOperation());
907
908 // Basic high-level verifications.
909 if (!gpuLaunch)
910 return emitSilenceableError() << "Given target is not a gpu.launch";
911
912 // Mapping to block ids.
913 SmallVector<int64_t> blockDims{getBlockDims()};
914 DiagnosedSilenceableFailure diag =
915 checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
916 blockDims[0], blockDims[1], blockDims[2]);
917 if (diag.isSilenceableFailure()) {
918 diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
919 return diag;
920 }
921
922 // Set the GPU launch configuration for the block dims early, this is not
923 // subject to IR inspection.
924 diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
925 std::nullopt, std::nullopt, blockDims[0], blockDims[1],
926 blockDims[2]);
927
928 rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
929 diag =
930 mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
931 getWarpSize(), getSyncAfterDistribute());
932
933 results.push_back(gpuLaunch.getOperation());
934 return diag;
935}
936
937//===----------------------------------------------------------------------===//
938// Transform op registration
939//===----------------------------------------------------------------------===//
940
941namespace {
942/// Registers the GPU transform ops and declares the dialects whose
943/// operations they may generate (GPU, AMDGPU, arith, SCF).
944class GPUTransformDialectExtension
945 : public transform::TransformDialectExtension<
946 GPUTransformDialectExtension> {
947public:
948 MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)
949
950 GPUTransformDialectExtension() {
951 declareGeneratedDialect<GPUDialect>();
952 declareGeneratedDialect<amdgpu::AMDGPUDialect>();
953 declareGeneratedDialect<arith::ArithDialect>();
954 declareGeneratedDialect<scf::SCFDialect>();
955 registerTransformOps<
956#define GET_OP_LIST
957#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
958 >();
959 }
960};
961} // namespace
962
963#define GET_OP_CLASSES
964#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
965
966void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
967 registry.addExtensions<GPUTransformDialectExtension>();
968}