GPUTransformOps.cpp
//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InterleavedRange.h"
#include <type_traits>

using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"
#define DEBUG_TYPE_ALIAS "gpu-transforms-alias"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // NVVM uses alloca in the default address space to represent private
  // memory allocations, so drop private annotations. NVVM uses address
  // space 3 for shared memory. NVVM uses the default address space to
  // represent global memory.
  // Used in populateGpuToNVVMConversionPatterns, so attaching here for now.
  // TODO: We should have a single to_nvvm_type_converter.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case AddressSpace::Private:
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
        return 0;
      });
  // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now.
  // TODO: We should have a single to_nvvm_type_converter.
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  // Set higher benefit, so patterns will run before generic LLVM lowering.
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns,
                                      getBenefit());
}

LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

//===----------------------------------------------------------------------===//
// Apply...PatternsOp
//===----------------------------------------------------------------------===//

void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
  populateGpuRewritePatterns(patterns);
}

//===----------------------------------------------------------------------===//
// ApplyUnrollVectorsSubgroupMmaOp
//===----------------------------------------------------------------------===//

/// Pick an unrolling order that will allow tensorcore operations to reuse the
/// LHS register.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  // First make reduction the outer dimensions.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isReductionIterator(iter)) {
      order.push_back(index);
    }
  }

  llvm::SmallDenseSet<int64_t> dims;
  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
    dims.insert(cast<AffineDimExpr>(expr).getPosition());
  }
  // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && dims.count(index)) {
      order.push_back(index);
    }
  }
  // Then the remaining parallel loops.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && !dims.count(index)) {
      order.push_back(index);
    }
  }
  return order;
}
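
// Worked example: for a canonical matmul vector.contract with indexing maps
// (m, k), (k, n) -> (m, n) and iterator types [parallel, parallel, reduction],
// the order computed above is [2, 0, 1]: the reduction dimension k first, then
// m (which appears in the LHS map), then the remaining parallel dimension n.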

/// Returns the target vector size for the target operation based on the native
/// vector size specified with `m`, `n`, and `k`.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Transfer read ops may need different shapes based on how they are being
    // used. For simplicity just match the shape used by the extract strided op.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      // TODO: The condition for unrolling elementwise should be restricted
      // only to operations that need unrolling (connected to the contract).
      if (vecType.getRank() < 2)
        return std::nullopt;

      // First check whether there is a slice to infer the shape from. This is
      // required for cases where the accumulator type differs from the input
      // types, in which case we will see an `arith.ext_` between the contract
      // and transfer_read which needs to be unrolled.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());

      // Else unroll for trailing elementwise.
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      // Map elementwise ops to the output shape.
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}
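
// Worked example: with native sizes m = 16, n = 16, k = 8, a rank-3
// vector.contract unrolls to [16, 16, 8], a rank-2 vector.transfer_write to
// [16, 16], and a rank-4 elementwise op with no vector.extract_strided_slice
// user to infer a slice shape from falls through to [1, 1, 16, 16].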

void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
    auto contract = dyn_cast<vector::ContractionOp>(op);
    if (!contract)
      return std::nullopt;
    return gpuMmaUnrollOrder(contract);
  };

  int64_t m = getM();
  int64_t n = getN();
  int64_t k = getK();
  auto nativeShapeFn =
      [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
    return getSubgroupMmaNativeVectorSize(op, m, n, k);
  };
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}

//===----------------------------------------------------------------------===//
// EliminateBarriersOp
//===----------------------------------------------------------------------===//

void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
  populateGpuEliminateBarriersPatterns(patterns);
}

//===----------------------------------------------------------------------===//
// Block and thread mapping utilities.
//===----------------------------------------------------------------------===//

namespace {
/// Local types used for mapping verification.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};
} // namespace

static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}

/// Check if given mapping attributes are one of the desired attributes.
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](Attribute a) {
    return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
  };
  if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
      !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  return DiagnosedSilenceableFailure::success();
}

template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check the types of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-type verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = cast<DeviceMappingAttrInterface>(
                              forallOp.getMapping()->getValue().front())
                              .isLinearMapping();
  // TODO: This would be more natural with support for Optional<EnumParameter>
  // in GPUDeviceMappingAttr.
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}

/// Struct to return the result of the rewrite of a forall operation.
struct ForallRewriteResult {
  SmallVector<int64_t> mappingSizes;
  SmallVector<Value> mappingIds;
};

/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}
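
// Illustrative example: invoked with OpTy = ThreadIdOp, `replacement` bound to
// a constant 0, and availableMappingSizes = {32, 4, 1}, this rewrites every
// use of gpu.thread_id z under `parent` to the constant 0, since that
// dimension is known to have extent 1.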

static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
    ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
  LDBG("--start rewriteOneForallCommonImpl");

  // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
  SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
  SetVector<Attribute> forallMappingAttrs;
  forallMappingAttrs.insert_range(forallOp.getMapping()->getValue());
  auto comparator = [](Attribute a, Attribute b) -> bool {
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  };

  // Step 1.b. In the linear case, compute the max mapping to avoid needlessly
  // mapping all dimensions. In the 3-D mapping case we need to map all
  // dimensions.
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
  for (auto attr : gpuIdBuilder.mappingAttributes) {
    // If attr overflows, just skip.
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
      continue;
    // Try to insert. If element was already present, just continue.
    if (!forallMappingAttrs.insert(attr))
      continue;
    // Otherwise, we have a new insertion without a size -> use size 1.
    tmpMappingSizes.push_back(1);
  }
  LDBG("----tmpMappingSizes extracted from scf.forall op: "
       << llvm::interleaved(tmpMappingSizes));

  // Step 2. Sort the values by the corresponding DeviceMappingAttrInterface.
  SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LDBG("----forallMappingSizes: " << llvm::interleaved(forallMappingSizes));
  LDBG("----forallMappingAttrs: " << llvm::interleaved(forallMappingAttrs));

  // Step 3. Generate the mappingIdOps using the provided generator.
  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(forallOp);
  SmallVector<int64_t> originalBasis(availableMappingSizes);
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
  }

  IdBuilderResult builderResult =
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);

  // Step 4. Map the induction variables to the mappingIdOps, this may involve
  // a permutation.
  SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
  IRMapping bvm;
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
    bvm.map(iv, peIdOp);
  }
490  // Step 5. If the originalBasis is already known, create conditionals to
491  // predicate the region. Otherwise, the current forall determines the
492  // originalBasis and no predication occurs.
493  Value predicate;
494  if (originalBasisWasProvided) {
495  SmallVector<int64_t> activeMappingSizes = builderResult.activeMappingSizes;
496  SmallVector<int64_t> availableMappingSizes =
497  builderResult.availableMappingSizes;
498  SmallVector<Value> activeIdOps = builderResult.activeIdOps;
499  LDBG("----activeMappingSizes: " << llvm::interleaved(activeMappingSizes));
500  LDBG("----availableMappingSizes: "
501  << llvm::interleaved(availableMappingSizes));
502  LDBG("----activeIdOps: " << llvm::interleaved(activeIdOps));
503  for (auto [activeId, activeMappingSize, availableMappingSize] :
504  llvm::zip_equal(activeIdOps, activeMappingSizes,
505  availableMappingSizes)) {
506  if (activeMappingSize > availableMappingSize) {
507  return definiteFailureHelper(
508  transformOp, forallOp,
509  "Trying to map to fewer GPU threads than loop iterations but "
510  "overprovisioning is not yet supported. "
511  "Try additional tiling of the before mapping or map to more "
512  "threads.");
513  }
514  if (activeMappingSize == availableMappingSize)
515  continue;
516  Value idx =
517  rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize);
518  Value tmpPredicate = rewriter.create<arith::CmpIOp>(
519  loc, arith::CmpIPredicate::ult, activeId, idx);
520  LDBG("----predicate: " << tmpPredicate);
521  predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
522  tmpPredicate)
523  : tmpPredicate;
524  }
525  }

  // Step 6. Move the body of forallOp.
  // Erase the terminator first, it will not be used.
  rewriter.eraseOp(forallOp.getTerminator());
  Block *targetBlock;
  Block::iterator insertionPoint;
  if (predicate) {
    // Step 6.a. If predicated, move at the beginning.
    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
                                           /*withElseRegion=*/false);
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
  } else {
    // Step 6.b. Otherwise, move inline just at the rewriter insertion
    // point.
    targetBlock = forallOp->getBlock();
    insertionPoint = rewriter.getInsertionPoint();
  }
  Block &sourceBlock = forallOp.getRegion().front();
  targetBlock->getOperations().splice(insertionPoint,
                                      sourceBlock.getOperations());

  // Step 7. RAUW indices.
  for (Value loopIndex : forallOp.getInductionVars()) {
    Value threadIdx = bvm.lookup(loopIndex);
    rewriter.replaceAllUsesWith(loopIndex, threadIdx);
  }

  // Step 8. Erase old op.
  rewriter.eraseOp(forallOp);

  LDBG("----result forallMappingSizes: "
       << llvm::interleaved(forallMappingSizes));
  LDBG("----result mappingIdOps: " << llvm::interleaved(mappingIdOps));

  result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
  return DiagnosedSilenceableFailure::success();
}
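
// Illustrative example of the Step 5 predication: mapping a normalized
// scf.forall with 12 iterations onto a thread dimension whose available size
// is 32 wraps the moved body in scf.if (%tid < 12), so the surplus threads
// become inactive instead of executing out-of-range iterations.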

//===----------------------------------------------------------------------===//
// MapForallToBlocks
//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
    RewriterBase &rewriter, TransformOpInterface transformOp,
    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
    const GpuIdBuilder &gpuIdBuilder) {
  LDBG("Start mapForallToBlocksImpl");

  {
    // GPU-specific verifications. There is no better place to anchor
    // those right now: the ForallOp is target-independent and the transform
    // op does not apply to individual ForallOp.
    DiagnosedSilenceableFailure diag =
        verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
    if (!diag.succeeded())
      return diag;
  }

  Location loc = forallOp.getLoc();
  Block *parentBlock = forallOp->getBlock();
  Value zero;
  {
    // Create an early zero index value for replacements and immediately reset
    // the insertion point.
    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(parentBlock);
    zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  }

  ForallRewriteResult rewriteResult;
  DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
      rewriter, transformOp, forallOp,
      /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);

  // Return if anything goes wrong, use silenceable failure as a match
  // failure.
  if (!diag.succeeded())
    return diag;

  // If gridDims was not provided already, set it from the return.
  if (gridDims.empty()) {
    gridDims = rewriteResult.mappingSizes;
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  }
  assert(gridDims.size() == 3 && "Need 3-D gridDims");

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  // Here, the result of mapping determines the available mapping sizes.
  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
                                          rewriteResult.mappingSizes);

  return diag;
}

DiagnosedSilenceableFailure
mlir::transform::gpu::findTopLevelForallOp(Operation *target,
                                           scf::ForallOp &topLevelForallOp,
                                           TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      // TODO: Handle multiple forall if they are independent.
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
           "attribute";
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }

  scf::ForallOp topLevelForallOp;
  DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(
      target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }
  assert(topLevelForallOp && "expect an scf.forall");

  SmallVector<int64_t> gridDims{getGridDims()};
  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform requires size-3 mapping");

  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(topLevelForallOp);

  // Generate gpu launch here and move the forall inside.
  if (getGenerateGpuLaunch()) {
    DiagnosedSilenceableFailure diag =
        createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
    if (!diag.succeeded())
      return diag;

    rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
    Operation *newForallOp = rewriter.clone(*topLevelForallOp);
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);
  }

  // The BlockIdBuilder adapts to whatever is thrown at it.
  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping()) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(
        topLevelForallOp.getMapping()->getValue().front());
    useLinearMapping = mappingAttr.isLinearMapping();
  }
  GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping);

  diag = mlir::transform::gpu::mapForallToBlocksImpl(
      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
    return diag;

  // Set the GPU launch configuration for the grid dims late, this is
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch,
                        cast<TransformOpInterface>(getOperation()), gridDims[0],
                        gridDims[1], gridDims[2]);

  results.push_back(gpuLaunch);
  return diag;
}

LogicalResult transform::MapForallToBlocks::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}
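
// Typical transform-IR usage of MapForallToBlocks (a sketch; the authoritative
// assembly format lives in the op's TableGen definition, not in this file):
//
//   %gpu_launch = transform.gpu.map_forall_to_blocks %func
//       generate_gpu_launch grid_dims = [32, 16, 1]
//       : (!transform.any_op) -> !transform.any_op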

//===----------------------------------------------------------------------===//
// MapNestedForallToThreads
//===----------------------------------------------------------------------===//

static DiagnosedSilenceableFailure checkMappingSpec(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
    auto diag = definiteFailureHelper(
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
            Twine(factor));
    return diag;
  }
  if (computeProduct(numParallelIterations) * factor >
      computeProduct(blockOrGridSizes)) {
    auto diag = definiteFailureHelper(
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
              "threads) ") +
            Twine(computeProduct(numParallelIterations) * factor) +
            " overflows the number of available resources " +
            Twine(computeProduct(blockOrGridSizes)));
    return diag;
  }
  return DiagnosedSilenceableFailure::success();
}

static DiagnosedSilenceableFailure
getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
                   scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
                   int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
  auto mappingAttr = cast<DeviceMappingAttrInterface>(
      forallOp.getMapping()->getValue().front());
  bool useLinearMapping = mappingAttr.isLinearMapping();

  // Sanity checks that may result in runtime verification errors.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  int64_t factor = 1;
  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
    factor = warpSize;
  }
  DiagnosedSilenceableFailure diag =
      checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
                       blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())
    return diag;

  // Start mapping.
  MLIRContext *ctx = forallOp.getContext();
  gpuIdBuilder =
      llvm::TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
          .Case([&](GPUWarpgroupMappingAttr) {
            return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping);
          })
          .Case([&](GPUWarpMappingAttr) {
            return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping);
          })
          .Case([&](GPUThreadMappingAttr) {
            return GpuThreadIdBuilder(ctx, useLinearMapping);
          })
          .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
            llvm_unreachable("unknown mapping attribute");
          });
  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
    bool syncAfterDistribute) {

  {
    // GPU-specific verifications. There is no better place to anchor
    // those right now: the ForallOp is target-independent and the transform
    // op does not apply to individual ForallOp.
    DiagnosedSilenceableFailure diag =
        verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
    if (!diag.succeeded())
      return diag;
  }

  GpuIdBuilder gpuIdBuilder;
  {
    // Try to construct the id builder, if it fails, return.
    DiagnosedSilenceableFailure diag = getThreadIdBuilder(
        transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
    if (!diag.succeeded())
      return diag;
  }

  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard g(rewriter);
  // Insert after to allow for syncthreads after `forall` is erased.
  rewriter.setInsertionPointAfter(forallOp);
  ForallRewriteResult rewriteResult;
  DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
      rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
    return diag;
  // Add a syncthreads if needed. TODO: warpsync
  if (syncAfterDistribute)
    rewriter.create<BarrierOp>(loc);
  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
    bool syncAfterDistribute) {
  LDBG("Start mapNestedForallToThreadsImpl");
  if (blockDims.size() != 3) {
    return definiteFailureHelper(transformOp, target,
                                 "requires size-3 thread mapping");
  }

  // Create an early zero index value for replacements.
  Location loc = target->getLoc();
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
    diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
        rewriter, transformOp, forallOp, blockDims, warpSize,
        syncAfterDistribute);
    if (diag.isDefiniteFailure())
      return WalkResult::interrupt();
    if (diag.succeeded())
      return WalkResult::skip();
    return WalkResult::advance();
  });
  if (walkResult.wasInterrupted())
    return diag;

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  // Here, the result of mapping determines the available mapping sizes.
  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
                                          blockDims);

  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  // Basic high-level verifications.
  if (!gpuLaunch)
    return emitSilenceableError() << "Given target is not a gpu.launch";

  // Mapping to block ids.
  SmallVector<int64_t> blockDims{getBlockDims()};
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
    return diag;
  }

  // Set the GPU launch configuration for the block dims early, this is not
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
                        std::nullopt, std::nullopt, blockDims[0], blockDims[1],
                        blockDims[2]);

  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
  diag =
      mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
                                   getWarpSize(), getSyncAfterDistribute());

  results.push_back(gpuLaunch.getOperation());
  return diag;
}
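
// Typical transform-IR usage of MapNestedForallToThreads (a sketch; see the
// op's TableGen definition for the authoritative assembly format):
//
//   transform.gpu.map_nested_forall_to_threads %gpu_launch
//       block_dims = [32, 4, 1]
//       : (!transform.any_op) -> !transform.any_op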

//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//

namespace {
/// Registers the GPU transform ops and declares the dialects (scf, arith, gpu)
/// that applying them may introduce into the payload IR.
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    declareGeneratedDialect<scf::SCFDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<GPUDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"

void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}
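
// Usage sketch: a tool that wants these transform ops available registers the
// extension on its DialectRegistry before constructing an MLIRContext, e.g.:
//
//   DialectRegistry registry;
//   mlir::gpu::registerTransformDialectExtension(registry);
//   MLIRContext ctx(registry);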