1 //===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
10 
29 #include "mlir/IR/AffineExpr.h"
30 #include "mlir/IR/Builders.h"
32 #include "mlir/IR/IRMapping.h"
33 #include "mlir/IR/MLIRContext.h"
34 #include "mlir/IR/OpDefinition.h"
35 #include "mlir/IR/Visitors.h"
36 #include "mlir/Support/LLVM.h"
38 #include "llvm/ADT/STLExtras.h"
39 #include "llvm/ADT/SmallVector.h"
40 #include "llvm/ADT/TypeSwitch.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include <type_traits>
44 
45 using namespace mlir;
46 using namespace mlir::gpu;
47 using namespace mlir::transform;
48 using namespace mlir::transform::gpu;
49 
50 #define DEBUG_TYPE "gpu-transforms"
51 #define DEBUG_TYPE_ALIAS "gpu-transforms-alias"
52 
53 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
54 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
55 #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
56 
57 //===----------------------------------------------------------------------===//
58 // Apply...ConversionPatternsOp
59 //===----------------------------------------------------------------------===//
60 
61 void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
62  TypeConverter &typeConverter, RewritePatternSet &patterns) {
63  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
64  // NVVM uses alloca in the default address space to represent private
65  // memory allocations, so drop private annotations. NVVM uses address
66  // space 3 for shared memory. NVVM uses the default address space to
67  // represent global memory.
68  // Used in populateGpuToNVVMConversionPatterns so attaching here for now.
69  // TODO: We should have a single to_nvvm_type_converter.
70  populateGpuMemorySpaceAttributeConversions(
71  llvmTypeConverter, [](AddressSpace space) -> unsigned {
72  switch (space) {
73  case AddressSpace::Global:
74  return static_cast<unsigned>(
75      NVVM::NVVMMemorySpace::kGlobalMemorySpace);
76  case AddressSpace::Workgroup:
77  return static_cast<unsigned>(
78      NVVM::NVVMMemorySpace::kSharedMemorySpace);
79  case AddressSpace::Private:
80  return 0;
81  }
82  llvm_unreachable("unknown address space enum value");
83  return 0;
84  });
85  // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now.
86  // TODO: We should have a single to_nvvm_type_converter.
87  llvmTypeConverter.addConversion(
88  [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
89  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
90 }
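// Illustrative sketch of the memory-space conversion attached above (example
// IR is hypothetical, not taken from this file): a memref in the GPU
// workgroup space such as
//   memref<32xf32, #gpu.address_space<workgroup>>
// lowers to an LLVM pointer in NVVM's shared address space (3), while
// #gpu.address_space<private> maps to the default address space 0.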
91 
92 LogicalResult
93 transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
94  transform::TypeConverterBuilderOpInterface builder) {
95  if (builder.getTypeConverterType() != "LLVMTypeConverter")
96  return emitOpError("expected LLVMTypeConverter");
97  return success();
98 }
99 
100 void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
101  TypeConverter &typeConverter, RewritePatternSet &patterns) {
102  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
103  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
104 }
105 
106 LogicalResult
107 transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
108  transform::TypeConverterBuilderOpInterface builder) {
109  if (builder.getTypeConverterType() != "LLVMTypeConverter")
110  return emitOpError("expected LLVMTypeConverter");
111  return success();
112 }
113 
114 void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
115  populatePatterns(TypeConverter &typeConverter,
116  RewritePatternSet &patterns) {
117  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
118  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
119 }
120 
121 LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
122  verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
123  if (builder.getTypeConverterType() != "LLVMTypeConverter")
124  return emitOpError("expected LLVMTypeConverter");
125  return success();
126 }
127 
128 //===----------------------------------------------------------------------===//
129 // Apply...PatternsOp
130 //===----------------------------------------------------------------------===//
131 
132 void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
133  populateGpuRewritePatterns(patterns);
134 }
135 
136 //===----------------------------------------------------------------------===//
137 // ApplyUnrollVectorsSubgroupMmaOp
138 //===----------------------------------------------------------------------===//
139 
140 /// Pick an unrolling order that will allow tensorcore operation to reuse LHS
141 /// register.
142 static std::optional<SmallVector<int64_t>>
143 gpuMmaUnrollOrder(vector::ContractionOp contract) {
144  SmallVector<int64_t> order;
145  // First make reduction the outer dimensions.
146  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
147  if (vector::isReductionIterator(iter)) {
148  order.push_back(index);
149  }
150  }
151 
152  llvm::SmallDenseSet<int64_t> dims;
153  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
154  dims.insert(cast<AffineDimExpr>(expr).getPosition());
155  }
156  // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
157  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
158  if (vector::isParallelIterator(iter) && dims.count(index)) {
159  order.push_back(index);
160  }
161  }
162  // Then the remaining parallel loops.
163  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
164  if (vector::isParallelIterator(iter) && !dims.count(index)) {
165  order.push_back(index);
166  }
167  }
168  return order;
169 }
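// Worked example (illustrative): for a canonical matmul contraction with
// iterator types [parallel, parallel, reduction] and an LHS indexing map of
// (d0, d2), the order produced above is [2, 0, 1]: the reduction dimension
// first, then the parallel dimension shared with the LHS, then the rest.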
170 
171 /// Returns the target vector size for the target operation based on the native
172 /// vector size specified with `m`, `n`, and `k`.
173 static std::optional<SmallVector<int64_t>>
174 getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) {
175  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
176  int64_t contractRank = contract.getIteratorTypes().size();
177  if (contractRank < 3)
178  return std::nullopt;
179  SmallVector<int64_t> nativeSize(contractRank - 3, 1);
180  nativeSize.append({m, n, k});
181  return nativeSize;
182  }
183  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
184  int64_t writeRank = writeOp.getVectorType().getRank();
185  if (writeRank < 2)
186  return std::nullopt;
187  SmallVector<int64_t> nativeSize(writeRank - 2, 1);
188  nativeSize.append({m, n});
189  return nativeSize;
190  }
191  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
192  // Transfer read ops may need different shapes based on how they are being
193  // used. For simplicity just match the shape used by the extract strided op.
194  VectorType sliceType;
195  for (Operation *users : op->getUsers()) {
196  auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
197  if (!extract)
198  return std::nullopt;
199  auto vecType = extract.getResult().getType().cast<VectorType>();
200  if (sliceType && sliceType != vecType)
201  return std::nullopt;
202  sliceType = vecType;
203  }
204  return llvm::to_vector(sliceType.getShape());
205  }
206  if ((OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)) {
207  if (auto vecType = op->getResultTypes()[0].dyn_cast<VectorType>()) {
208  // TODO: The condition for unrolling elementwise should be restricted
209  // only to operations that need unrolling (connected to the contract).
210  if (vecType.getRank() < 2)
211  return std::nullopt;
212 
213  // First check whether there is a slice to infer the shape from. This is
214  // required for cases where the accumulator type differs from the input
215  // types, in which case we will see an `arith.ext_` between the contract
216  // and transfer_read which needs to be unrolled.
217  VectorType sliceType;
218  for (Operation *users : op->getUsers()) {
219  auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
220  if (!extract)
221  return std::nullopt;
222  auto vecType = extract.getResult().getType().cast<VectorType>();
223  if (sliceType && sliceType != vecType)
224  return std::nullopt;
225  sliceType = vecType;
226  }
227  if (sliceType)
228  return llvm::to_vector(sliceType.getShape());
229 
230  // Else unroll for trailing elementwise.
231  SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
232  // Map elementwise ops to the output shape.
233  nativeSize.append({m, n});
234  return nativeSize;
235  }
236  }
237  return std::nullopt;
238 }
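// Worked example (illustrative): with m = 16, n = 16, k = 8, a rank-4
// vector.contract gets nativeSize = [1, 16, 16, 8], a rank-2
// vector.transfer_write gets [16, 16], and a rank-2 elementwise op without
// extract_strided_slice users gets the output shape [16, 16].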
239 
240 void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
241  RewritePatternSet &patterns) {
242  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
243  auto contract = dyn_cast<vector::ContractionOp>(op);
244  if (!contract)
245  return std::nullopt;
246  return gpuMmaUnrollOrder(contract);
247  };
248 
249  int64_t m = getM();
250  int64_t n = getN();
251  int64_t k = getK();
252  auto nativeShapeFn =
253  [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
254  return getSubgroupMmaNativeVectorSize(op, m, n, k);
255  };
256  vector::populateVectorUnrollPatterns(
257  patterns, vector::UnrollVectorOptions()
258  .setNativeShapeFn(nativeShapeFn)
259  .setUnrollTraversalOrderFn(unrollOrder));
260 }
261 
262 //===----------------------------------------------------------------------===//
263 // EliminateBarriersOp
264 //===----------------------------------------------------------------------===//
265 
266 void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
267  populateGpuEliminateBarriersPatterns(patterns);
268 }
269 
270 //===----------------------------------------------------------------------===//
271 // Block and thread mapping utilities.
272 //===----------------------------------------------------------------------===//
273 
274 namespace {
275 /// Local types used for mapping verification.
276 struct MappingKind {};
277 struct BlockMappingKind : MappingKind {};
278 struct ThreadMappingKind : MappingKind {};
279 } // namespace
280 
281 static DiagnosedSilenceableFailure
282 definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
283  Operation *target, const Twine &message) {
284  if (transformOp.has_value())
285  return transformOp->emitDefiniteFailure() << message;
286  return emitDefiniteFailure(target, message);
287 }
288 
289 /// Check that the given mapping attributes are of one of the desired kinds.
290 template <typename MappingKindType>
291 static DiagnosedSilenceableFailure
292 checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
293  scf::ForallOp forallOp) {
294  if (!forallOp.getMapping().has_value()) {
295  return definiteFailureHelper(transformOp, forallOp,
296  "scf.forall op requires a mapping attribute");
297  }
298 
299  bool hasBlockMapping =
300  llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
301  return isa<GPUBlockMappingAttr>(attr);
302  });
303  bool hasWarpgroupMapping =
304  llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
305  return isa<GPUWarpgroupMappingAttr>(attr);
306  });
307  bool hasWarpMapping =
308  llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
309  return isa<GPUWarpMappingAttr>(attr);
310  });
311  bool hasThreadMapping =
312  llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
313  return isa<GPUThreadMappingAttr>(attr);
314  });
315  int64_t countMappingTypes = 0;
316  countMappingTypes += hasBlockMapping ? 1 : 0;
317  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
318  countMappingTypes += hasWarpMapping ? 1 : 0;
319  countMappingTypes += hasThreadMapping ? 1 : 0;
320  if (countMappingTypes > 1) {
321  return definiteFailureHelper(
322  transformOp, forallOp,
323  "cannot mix different mapping types, use nesting");
324  }
325  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
326  !hasBlockMapping) {
327  return definiteFailureHelper(
328  transformOp, forallOp,
329  "scf.forall op requires a mapping attribute of kind 'block'");
330  }
331  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
332  !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
333  return definiteFailureHelper(transformOp, forallOp,
334  "scf.forall op requires a mapping attribute "
335  "of kind 'thread' or 'warp'");
336  }
337 
338  DenseSet<Attribute> seen;
339  for (Attribute map : forallOp.getMapping()->getValue()) {
340  if (seen.contains(map)) {
341  return definiteFailureHelper(
342  transformOp, forallOp,
343  "duplicate attribute, cannot map different loops "
344  "to the same mapping id");
345  }
346  seen.insert(map);
347  }
348 
349  auto isLinear = [](Attribute a) {
350  return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
351  };
352  if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
353  !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
354  return definiteFailureHelper(
355  transformOp, forallOp,
356  "cannot mix linear and non-linear mapping modes");
357  }
358 
359   return DiagnosedSilenceableFailure::success();
360 }
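// Example of what the check above accepts and rejects (illustrative mapping
// attribute lists, not taken from this file):
//   mapping = [#gpu.block<x>, #gpu.block<y>]    // ok: one kind, no duplicates
//   mapping = [#gpu.block<x>, #gpu.thread<y>]   // rejected: mixed kinds
//   mapping = [#gpu.thread<x>, #gpu.thread<x>]  // rejected: duplicate id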
361 
362 template <typename MappingKindType>
363 static DiagnosedSilenceableFailure
364 verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
365  scf::ForallOp forallOp) {
366  // Check the types of the mapping attributes match.
367   DiagnosedSilenceableFailure typeRes =
368       checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
369  if (!typeRes.succeeded())
370  return typeRes;
371 
372  // Perform other non-types verifications.
373  if (!forallOp.isNormalized())
374  return definiteFailureHelper(transformOp, forallOp,
375  "unsupported non-normalized loops");
376  if (forallOp.getNumResults() > 0)
377  return definiteFailureHelper(transformOp, forallOp,
378  "only bufferized scf.forall can be mapped");
379  bool useLinearMapping = cast<DeviceMappingAttrInterface>(
380  forallOp.getMapping()->getValue().front())
381  .isLinearMapping();
382  // TODO: This would be more natural with support for Optional<EnumParameter>
383  // in GPUDeviceMappingAttr.
384  int64_t maxNumMappingsSupported =
385  useLinearMapping ? (getMaxEnumValForMappingId() -
386  static_cast<uint64_t>(MappingId::DimZ))
387  : 3;
388  if (forallOp.getRank() > maxNumMappingsSupported) {
389  return definiteFailureHelper(transformOp, forallOp,
390  "scf.forall with rank > ")
391  << maxNumMappingsSupported
392  << " does not lower for the specified mapping attribute type";
393  }
394  auto numParallelIterations =
395  getConstantIntValues(forallOp.getMixedUpperBound());
396  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
397  return definiteFailureHelper(
398  transformOp, forallOp,
399  "requires statically sized, normalized forall op");
400  }
401   return DiagnosedSilenceableFailure::success();
402 }
403 
404 /// Struct to return the result of the rewrite of a forall operation.
405 struct ForallRewriteResult {
406   SmallVector<int64_t> mappingSizes;
407   SmallVector<Value> mappingIds;
408 };
409 
410 /// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
411 template <typename OpTy, typename OperationOrBlock>
412 static void
413 replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
414                             OperationOrBlock *parent, Value replacement,
415  ArrayRef<int64_t> availableMappingSizes) {
416  parent->walk([&](OpTy idOp) {
417  if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
418  rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
419  });
420 }
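// Illustrative use: with availableMappingSizes = [4, 2, 1], every `OpTy` id
// op for the z dimension (size 1) below `parent` has its result replaced by
// `replacement`, typically a constant 0 created by the caller.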
421 
422 static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
423     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
424  scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
425  ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
426  LDBG("--start rewriteOneForallCommonImpl");
427 
428  // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
429  auto numParallelIterations =
430  getConstantIntValues(forallOp.getMixedUpperBound());
431  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
432  "requires statically sized, normalized forall op");
433  SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
434  SetVector<Attribute> forallMappingAttrs;
435  forallMappingAttrs.insert(forallOp.getMapping()->getValue().begin(),
436  forallOp.getMapping()->getValue().end());
437  auto comparator = [](Attribute a, Attribute b) -> bool {
438  return cast<DeviceMappingAttrInterface>(a).getMappingId() <
439  cast<DeviceMappingAttrInterface>(b).getMappingId();
440  };
441 
442  // Step 1.b. In the linear case, compute the max mapping to avoid needlessly
443  // mapping all dimensions. In the 3-D mapping case we need to map all
444  // dimensions.
445  DeviceMappingAttrInterface maxMapping =
446  cast<DeviceMappingAttrInterface>(*std::max_element(
447  forallMappingAttrs.begin(), forallMappingAttrs.end(), comparator));
448  DeviceMappingAttrInterface maxLinearMapping;
449  if (maxMapping.isLinearMapping())
450  maxLinearMapping = maxMapping;
451  for (auto attr : gpuIdBuilder.mappingAttributes) {
452  // If attr overflows, just skip.
453  if (maxLinearMapping && comparator(maxLinearMapping, attr))
454  continue;
455  // Try to insert. If element was already present, just continue.
456  if (!forallMappingAttrs.insert(attr))
457  continue;
458  // Otherwise, we have a new insertion without a size -> use size 1.
459  tmpMappingSizes.push_back(1);
460  }
461  LLVM_DEBUG(
462  llvm::interleaveComma(
463  tmpMappingSizes,
464  DBGS() << "----tmpMappingSizes extracted from scf.forall op: ");
465  llvm::dbgs() << "\n");
466 
467  // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
468  SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
469  forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
470  LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
471  DBGS() << "----forallMappingSizes: ");
472  llvm::dbgs() << "\n"; llvm::interleaveComma(
473  forallMappingAttrs, DBGS() << "----forallMappingAttrs: ");
474  llvm::dbgs() << "\n");
475 
476  // Step 3. Generate the mappingIdOps using the provided generator.
477  Location loc = forallOp.getLoc();
478  OpBuilder::InsertionGuard guard(rewriter);
479  rewriter.setInsertionPoint(forallOp);
480  SmallVector<int64_t> originalBasis(availableMappingSizes);
481  bool originalBasisWasProvided = !originalBasis.empty();
482  if (!originalBasisWasProvided) {
483  originalBasis = forallMappingSizes;
484  while (originalBasis.size() < 3)
485  originalBasis.push_back(1);
486  }
487 
488  IdBuilderResult builderResult =
489  gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
490 
491  // Step 4. Map the induction variables to the mappingIdOps, this may involve
492  // a permutation.
493  SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
494  IRMapping bvm;
495  for (auto [iv, dim] : llvm::zip_equal(
496  forallOp.getInductionVars(),
497  forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
498  auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
499  Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
500  bvm.map(iv, peIdOp);
501  }
502 
503  // Step 5. If the originalBasis is already known, create conditionals to
504  // predicate the region. Otherwise, the current forall determines the
505  // originalBasis and no predication occurs.
506  Value predicate;
507  if (originalBasisWasProvided) {
508  SmallVector<int64_t> activeMappingSizes = builderResult.activeMappingSizes;
509  SmallVector<int64_t> availableMappingSizes =
510  builderResult.availableMappingSizes;
511  SmallVector<Value> activeIdOps = builderResult.activeIdOps;
512  // clang-format off
513  LLVM_DEBUG(
514  llvm::interleaveComma(
515  activeMappingSizes, DBGS() << "----activeMappingSizes: ");
516  llvm::dbgs() << "\n";
517  llvm::interleaveComma(
518  availableMappingSizes, DBGS() << "----availableMappingSizes: ");
519  llvm::dbgs() << "\n";
520  llvm::interleaveComma(activeIdOps, DBGS() << "----activeIdOps: ");
521  llvm::dbgs() << "\n");
522  // clang-format on
523  for (auto [activeId, activeMappingSize, availableMappingSize] :
524  llvm::zip_equal(activeIdOps, activeMappingSizes,
525  availableMappingSizes)) {
526  if (activeMappingSize > availableMappingSize) {
527  return definiteFailureHelper(
528  transformOp, forallOp,
529  "Trying to map to fewer GPU threads than loop iterations but "
530  "overprovisioning is not yet supported. "
531  "Try additional tiling before mapping or map to more "
532  "threads.");
533  }
534  if (activeMappingSize == availableMappingSize)
535  continue;
536  Value idx =
537  rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize);
538  Value tmpPredicate = rewriter.create<arith::CmpIOp>(
539  loc, arith::CmpIPredicate::ult, activeId, idx);
540  LDBG("----predicate: " << tmpPredicate);
541  predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
542  tmpPredicate)
543  : tmpPredicate;
544  }
545  }
546 
547  // Step 6. Move the body of forallOp.
548  // Erase the terminator first, it will not be used.
549  rewriter.eraseOp(forallOp.getTerminator());
550  Block *targetBlock;
551  Block::iterator insertionPoint;
552  if (predicate) {
553  // Step 6.a. If predicated, move the body to the beginning of the scf.if.
554  auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
555  /*withElseRegion=*/false);
556  targetBlock = ifOp.thenBlock();
557  insertionPoint = ifOp.thenBlock()->begin();
558  } else {
559  // Step 6.b. Otherwise, move inline just at the rewriter insertion
560  // point.
561  targetBlock = forallOp->getBlock();
562  insertionPoint = rewriter.getInsertionPoint();
563  }
564  Block &sourceBlock = forallOp.getRegion().front();
565  targetBlock->getOperations().splice(insertionPoint,
566  sourceBlock.getOperations());
567 
568  // Step 7. RAUW indices.
569  for (Value loopIndex : forallOp.getInductionVars()) {
570  Value threadIdx = bvm.lookup(loopIndex);
571  rewriter.replaceAllUsesWith(loopIndex, threadIdx);
572  }
573 
574  // Step 8. Erase old op.
575  rewriter.eraseOp(forallOp);
576 
577  LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
578  DBGS() << "----result forallMappingSizes: ");
579  llvm::dbgs() << "\n"; llvm::interleaveComma(
580  mappingIdOps, DBGS() << "----result mappingIdOps: ");
581  llvm::dbgs() << "\n");
582 
583  result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
584   return DiagnosedSilenceableFailure::success();
585 }
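// Overall effect (sketch): the scf.forall body is inlined at the current
// insertion point (or under an scf.if when predication against a larger
// launch basis is needed), its induction variables are replaced by the ids
// produced by `gpuIdBuilder`, and the sorted per-dimension trip counts are
// reported back through `result`.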
586 
587 //===----------------------------------------------------------------------===//
588 // MapForallToBlocks
589 //===----------------------------------------------------------------------===//
590 
591 DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
592     RewriterBase &rewriter, TransformOpInterface transformOp,
593  scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
594  const GpuIdBuilder &gpuIdBuilder) {
595  LDBG("Start mapForallToBlocksImpl");
596 
597  {
598  // GPU-specific verifications. There is no better place to anchor
599  // those right now: the ForallOp is target-independent and the transform
600  // op does not apply to individual ForallOp.
601     DiagnosedSilenceableFailure diag =
602         verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
603  if (!diag.succeeded())
604  return diag;
605  }
606 
607  Location loc = forallOp.getLoc();
608  Block *parentBlock = forallOp->getBlock();
609  Value zero;
610  {
611  // Create an early zero index value for replacements and immediately reset
612  // the insertion point.
613  OpBuilder::InsertionGuard guard(rewriter);
614  rewriter.setInsertionPointToStart(parentBlock);
615  zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
616  }
617 
618  ForallRewriteResult rewriteResult;
619   DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
620       rewriter, transformOp, forallOp,
621  /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);
622 
623  // Return if anything goes wrong, use silenceable failure as a match
624  // failure.
625  if (!diag.succeeded())
626  return diag;
627 
628  // If gridDims was not provided already, set it from the return.
629  if (gridDims.empty()) {
630  gridDims = rewriteResult.mappingSizes;
631  while (gridDims.size() < 3)
632  gridDims.push_back(1);
633  }
634  assert(gridDims.size() == 3 && "Need 3-D gridDims");
635 
636  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
637  // Here, the result of mapping determines the available mapping sizes.
638  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
639  rewriteResult.mappingSizes);
640 
641   return DiagnosedSilenceableFailure::success();
642 }
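// Sketch of the rewrite performed here, on hypothetical IR:
//   scf.forall (%i, %j) in (128, 64) { ... }
//       {mapping = [#gpu.block<x>, #gpu.block<y>]}
// becomes a body that directly uses the x/y block ids, and `gridDims` is
// filled in as [128, 64, 1] when it was not provided by the caller.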
643 
644 DiagnosedSilenceableFailure
645 mlir::transform::gpu::findTopLevelForallOp(Operation *target,
646     scf::ForallOp &topLevelForallOp,
647  TransformOpInterface transformOp) {
648  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
649  if (forallOp->getParentOfType<scf::ForallOp>())
650  return WalkResult::advance();
651  if (topLevelForallOp)
652  // TODO: Handle multiple foralls if they are independent.
653  return WalkResult::interrupt();
654  topLevelForallOp = forallOp;
655  return WalkResult::advance();
656  });
657 
658  if (walkResult.wasInterrupted() || !topLevelForallOp)
659  return transformOp.emitSilenceableError()
660  << "could not find a unique topLevel scf.forall";
661   return DiagnosedSilenceableFailure::success();
662 }
663 
664 DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
665  transform::TransformRewriter &rewriter, Operation *target,
666     ApplyToEachResultList &results, TransformState &state) {
667   LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
668  auto transformOp = cast<TransformOpInterface>(getOperation());
669 
670  if (!getGenerateGpuLaunch() && !gpuLaunch) {
671     DiagnosedSilenceableFailure diag =
672         emitSilenceableError()
673  << "Given target is not gpu.launch, set `generate_gpu_launch` "
674  "attribute";
675  diag.attachNote(target->getLoc()) << "when applied to this payload op";
676  return diag;
677  }
678 
679  scf::ForallOp topLevelForallOp;
680   DiagnosedSilenceableFailure diag = findTopLevelForallOp(
681       target, topLevelForallOp, transformOp);
682  if (!diag.succeeded()) {
683  diag.attachNote(target->getLoc()) << "when applied to this payload op";
684  return diag;
685  }
686  assert(topLevelForallOp && "expect an scf.forall");
687 
688  SmallVector<int64_t> gridDims{getGridDims()};
689  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
690     return transformOp.emitDefiniteFailure("transform requires size-3 mapping");
691 
692  OpBuilder::InsertionGuard guard(rewriter);
693  rewriter.setInsertionPoint(topLevelForallOp);
694 
695  // Generate gpu launch here and move the forall inside
696  if (getGenerateGpuLaunch()) {
697     DiagnosedSilenceableFailure diag =
698         createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
699  if (!diag.succeeded())
700  return diag;
701 
702  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
703  Operation *newForallOp = rewriter.clone(*topLevelForallOp);
704  rewriter.eraseOp(topLevelForallOp);
705  topLevelForallOp = cast<scf::ForallOp>(newForallOp);
706  }
707 
708  // The BlockIdBuilder adapts to whatever is thrown at it.
709  bool useLinearMapping = false;
710  if (topLevelForallOp.getMapping()) {
711  auto mappingAttr = cast<DeviceMappingAttrInterface>(
712  topLevelForallOp.getMapping()->getValue().front());
713  useLinearMapping = mappingAttr.isLinearMapping();
714  }
715  GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping);
716 
717   diag = mapForallToBlocksImpl(
718       rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
719  if (!diag.succeeded())
720  return diag;
721 
722  // Set the GPU launch configuration for the grid dims late, this is
723  // subject to IR inspection.
724  diag = alterGpuLaunch(rewriter, gpuLaunch,
725  cast<TransformOpInterface>(getOperation()), gridDims[0],
726  gridDims[1], gridDims[2]);
727 
728  results.push_back(gpuLaunch);
729  return diag;
730 }
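// Illustrative transform IR driving this op (a usage sketch only; see
// GPUTransformOps.td for the authoritative assembly format):
//   %launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
//       : (!transform.any_op) -> !transform.any_op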
731 
732 LogicalResult transform::MapForallToBlocks::verify() {
733   if (!getGridDims().empty() && getGridDims().size() != 3) {
734  return emitOpError() << "transform requires empty or size-3 grid_dims";
735  }
736  return success();
737 }
738 
739 //===----------------------------------------------------------------------===//
740 // MapNestedForallToThreads
741 //===----------------------------------------------------------------------===//
742 
743 static DiagnosedSilenceableFailure checkMappingSpec(
744     std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
745  ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
746  int factor, bool useLinearMapping = false) {
747  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
748     auto diag = definiteFailureHelper(
749         transformOp, forallOp,
750  Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
751  std::to_string(factor));
752  return diag;
753  }
754  if (computeProduct(numParallelIterations) * factor >
755  computeProduct(blockOrGridSizes)) {
756     auto diag = definiteFailureHelper(
757         transformOp, forallOp,
758  Twine("the number of required parallel resources (blocks or "
759  "threads) ") +
760  std::to_string(computeProduct(numParallelIterations) * factor) +
761  std::string(" overflows the number of available resources ") +
762  std::to_string(computeProduct(blockOrGridSizes)));
763  return diag;
764  }
765   return DiagnosedSilenceableFailure::success();
766 }
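// Worked example (illustrative): for a warp mapping with warpSize = 32
// (factor = 32) and blockOrGridSizes = [64, 2, 1], i.e. 128 threads, at most
// 128 / 32 = 4 warps are available; a forall requiring 8 warps yields
// 8 * 32 = 256 > 128 and triggers the resource-overflow error above.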
767 
768 static DiagnosedSilenceableFailure
769 getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
770  scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
771  int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
772  auto mappingAttr = cast<DeviceMappingAttrInterface>(
773  forallOp.getMapping()->getValue().front());
774  bool useLinearMapping = mappingAttr.isLinearMapping();
775 
776  // Sanity checks that may result in runtime verification errors.
777  auto numParallelIterations =
778  getConstantIntValues((forallOp.getMixedUpperBound()));
779  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
780  return definiteFailureHelper(
781  transformOp, forallOp,
782  "requires statically sized, normalized forall op");
783  }
784  int64_t factor = 1;
785  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
786  factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
787  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
788  factor = warpSize;
789  }
790   DiagnosedSilenceableFailure diag =
791       checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
792  blockSizes, factor, useLinearMapping);
793  if (!diag.succeeded())
794  return diag;
795 
796  // Start mapping.
797  MLIRContext *ctx = forallOp.getContext();
798   gpuIdBuilder =
799       llvm::TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
800  .Case([&](GPUWarpgroupMappingAttr) {
801  return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping);
802  })
803  .Case([&](GPUWarpMappingAttr) {
804  return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping);
805  })
806  .Case([&](GPUThreadMappingAttr) {
807  return GpuThreadIdBuilder(ctx, useLinearMapping);
808  })
809  .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
810  llvm_unreachable("unknown mapping attribute");
811  });
812   return DiagnosedSilenceableFailure::success();
813 }
814 
815 DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
816     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
817  scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
818  bool syncAfterDistribute) {
819 
820  {
821  // GPU-specific verifications. There is no better place to anchor
822  // those right now: the ForallOp is target-independent and the transform
823  // op does not apply to individual ForallOp.
824     DiagnosedSilenceableFailure diag =
825         verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
826  if (!diag.succeeded())
827  return diag;
828  }
829 
830  GpuIdBuilder gpuIdBuilder;
831  {
832     // Try to construct the id builder; if it fails, return.
833     DiagnosedSilenceableFailure diag = getThreadIdBuilder(
834         transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
835  if (!diag.succeeded())
836  return diag;
837  }
838 
839  Location loc = forallOp.getLoc();
840  OpBuilder::InsertionGuard g(rewriter);
841  // Insert after to allow for syncthreads after `forall` is erased.
842  rewriter.setInsertionPointAfter(forallOp);
843  ForallRewriteResult rewriteResult;
844   DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
845       rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
846  if (!diag.succeeded())
847  return diag;
848  // Add a syncthreads if needed. TODO: warpsync
849  if (syncAfterDistribute)
850  rewriter.create<BarrierOp>(loc);
851 
852   return DiagnosedSilenceableFailure::success();
853 }
854 
855 DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
856     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
857  Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
858  bool syncAfterDistribute) {
859  LDBG("Start mapNestedForallToThreadsImpl");
860  if (blockDims.size() != 3) {
861  return definiteFailureHelper(transformOp, target,
862  "requires size-3 thread mapping");
863  }
864 
865  // Create an early zero index value for replacements.
866  Location loc = target->getLoc();
867  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
868   DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
869   WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
870     diag = mapOneForallToThreadsImpl(
871         rewriter, transformOp, forallOp, blockDims, warpSize,
872  syncAfterDistribute);
873  if (diag.isDefiniteFailure())
874  return WalkResult::interrupt();
875  if (diag.succeeded())
876  return WalkResult::skip();
877  return WalkResult::advance();
878  });
879  if (walkResult.wasInterrupted())
880  return diag;
881 
882  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
883  // Here, the result of mapping determines the available mapping sizes.
884  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
885  blockDims);
886 
887   return DiagnosedSilenceableFailure::success();
888 }
889 
890 DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
891  transform::TransformRewriter &rewriter, Operation *target,
892  ApplyToEachResultList &results, TransformState &state) {
893  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
894  auto transformOp = cast<TransformOpInterface>(getOperation());
895 
896  // Basic high-level verifications.
897  if (!gpuLaunch)
898  return emitSilenceableError() << "Given target is not a gpu.launch";
899 
900  // Mapping to block ids.
901  SmallVector<int64_t> blockDims{getBlockDims()};
902   DiagnosedSilenceableFailure diag =
903       checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
904  blockDims[0], blockDims[1], blockDims[2]);
905  if (diag.isSilenceableFailure()) {
906  diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
907  return diag;
908  }
909 
910  // Set the GPU launch configuration for the block dims early, this is not
911  // subject to IR inspection.
912  diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
913  std::nullopt, std::nullopt, blockDims[0], blockDims[1],
914  blockDims[2]);
915 
916  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
917  diag =
918  mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
919  getWarpSize(), getSyncAfterDistribute());
920 
921  results.push_back(gpuLaunch.getOperation());
922  return diag;
923 }
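// Illustrative transform IR driving this op (a usage sketch only; see
// GPUTransformOps.td for the authoritative assembly format):
//   transform.gpu.map_nested_forall_to_threads %gpu_launch
//       block_dims = [32, 4, 1] : (!transform.any_op) -> !transform.any_op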
924 
925 //===----------------------------------------------------------------------===//
926 // Transform op registration
927 //===----------------------------------------------------------------------===//
928 
929 namespace {
930 /// Registers the transform ops and declares the dialects they may generate
931 /// operations from (scf, arith, gpu) as dependent dialects.
932 class GPUTransformDialectExtension
933     : public transform::TransformDialectExtension<
934           GPUTransformDialectExtension> {
935 public:
936  GPUTransformDialectExtension() {
937  declareGeneratedDialect<scf::SCFDialect>();
938  declareGeneratedDialect<arith::ArithDialect>();
939  declareGeneratedDialect<GPUDialect>();
940  registerTransformOps<
941 #define GET_OP_LIST
942 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
943  >();
944  }
945 };
946 } // namespace
947 
948 #define GET_OP_CLASSES
949 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
950 
951 void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
952   registry.addExtensions<GPUTransformDialectExtension>();
953 }