//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/Utils.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/InterleavedRange.h"

using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"

/// Build predicates to filter execution so that only the `activeIds` run.
/// Along each dimension, 3 cases appear:
///   1. activeMappingSize > availableMappingSize: this is an unsupported case
///      as it would require additional looping. An error message is produced
///      to advise the user to tile more or to use more threads.
///   2. activeMappingSize == availableMappingSize: no predication is needed.
///   3. activeMappingSize < availableMappingSize: only a subset of threads
///      should be active, so we produce the boolean `id < activeMappingSize`
///      for further use in building predicated execution.
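///
/// For example, mapping 4 active ids onto 8 available ids along a dimension
/// falls in case 3 and yields the single predicate `id < 4`; equal sizes
/// (case 2) yield no predicate at all.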
static FailureOr<SmallVector<Value>>
buildPredicates(RewriterBase &rewriter, Location loc, ArrayRef<Value> activeIds,
                ArrayRef<int64_t> activeMappingSizes,
                ArrayRef<int64_t> availableMappingSizes,
                std::string &errorMsg) {
  LDBG() << "----activeMappingSizes: " << llvm::interleaved(activeMappingSizes);
  LDBG() << "----availableMappingSizes: "
         << llvm::interleaved(availableMappingSizes);

  SmallVector<Value> predicateOps;
  for (auto [activeId, activeMappingSize, availableMappingSize] :
       llvm::zip_equal(activeIds, activeMappingSizes, availableMappingSizes)) {
    if (activeMappingSize > availableMappingSize) {
      errorMsg = "Trying to map to fewer GPU threads than loop iterations but "
                 "overprovisioning is not yet supported. Try additional tiling "
                 "before mapping or map to more threads.";
      return failure();
    }
    if (activeMappingSize == availableMappingSize)
      continue;
    Value idx =
        arith::ConstantIndexOp::create(rewriter, loc, activeMappingSize);
    Value pred = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::ult,
                                       activeId, idx);
    predicateOps.push_back(pred);
  }
  return predicateOps;
}

/// Return a flattened thread id for the workgroup with given sizes.
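/// The flattening follows the usual row-major GPU convention:
///   linearId = tx + ty * bdx + tz * bdx * bdy
/// e.g. with block dims (bdx, bdy) = (32, 4), thread (1, 2, 3) gets linear id
/// 1 + 2 * 32 + 3 * 32 * 4 = 449.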
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LDBG() << "----buildLinearId with originalBasisOfr: "
         << llvm::interleaved(originalBasisOfr);
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, bdx, bdy;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), bdx, bdy);
  SmallVector<OpFoldResult> vals{
      ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::x)
          .getResult(),
      ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::y)
          .getResult(),
      ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}

/// Create a linear id builder that takes the `originalBasisOfr` and
/// decomposes it in the basis of `forallMappingSizes`. The linear id builder
/// returns an n-D vector of ids for indexing and a 1-D size + id for
/// predicate generation.
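///
/// For instance, with originalBasis = {32, 4, 1} (128 physical ids),
/// multiplicity = 1 and forallMappingSizes = {4, 8}, the builder produces a
/// 2-D vector of ids delinearized from the flat id, plus the predicate
/// `linearId < 32` so that only the first 32 ids execute.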
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType
commonLinearIdBuilderFn(int64_t multiplicity = 1,
                        DeviceMaskingAttrInterface mask = nullptr) {
  auto res = [multiplicity, mask](RewriterBase &rewriter, Location loc,
                                  ArrayRef<int64_t> forallMappingSizes,
                                  ArrayRef<int64_t> originalBasis) {
    // 0. Early-exit mask case.
    if (mask) {
      if (computeProduct(originalBasis) >
          mask.getMaxNumPhysicalIds() * multiplicity) {
        return IdBuilderResult{
            /*errorMsg=*/std::string(
                "mask representation too short to capture all physical ids: ") +
                std::to_string(mask.getMaxNumPhysicalIds()),
            /*mappingIdOps=*/{},
            /*predicateOps=*/{}};
      }
    }

    // 1. Compute linearId.
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    Value physicalLinearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);

    // 2. Compute scaledLinearId.
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    OpFoldResult scaledLinearIdOfr = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {physicalLinearId});

    // 2.b. Adjust with mask if needed.
    Value scaledLinearIdI64;
    Value scaledLinearId =
        getValueOrCreateConstantIndexOp(rewriter, loc, scaledLinearIdOfr);
    if (mask) {
      scaledLinearIdI64 = arith::IndexCastUIOp::create(
          rewriter, loc, rewriter.getI64Type(), scaledLinearId);
      Value logicalLinearIdI64 =
          mask.createLogicalLinearMappingId(rewriter, scaledLinearIdI64);
      scaledLinearId = arith::IndexCastUIOp::create(
          rewriter, loc, rewriter.getIndexType(), logicalLinearIdI64);
      LDBG() << "------adjusting linearId with mask: " << scaledLinearId;
    }

    // 3. Compute remapped indices.
    SmallVector<Value> ids;
    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
    // "row-major" order.
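    // E.g. forallMappingSizes = {4, 8} reverses to {8, 4} with strides
    // {4, 1}; linear id 11 delinearizes to (11 floordiv 4, 11 mod 4) = (2, 3),
    // which reverses back to ids (3, 2).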
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    // Reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }

    std::string errorMsg;
    SmallVector<Value> predicateOps;
    // 4. If mask present, it takes precedence to determine predication.
    if (mask) {
      Value isActiveIdPredicate =
          mask.createIsActiveIdPredicate(rewriter, scaledLinearIdI64);
      LDBG() << "------adjusting predicate with mask: " << isActiveIdPredicate;
      predicateOps.push_back(isActiveIdPredicate);
    } else {
      // 4.b. Otherwise, handle predicates using physicalLinearId.
      FailureOr<SmallVector<Value>> maybePredicateOps =
          buildPredicates(rewriter, loc, physicalLinearId,
                          computeProduct(forallMappingSizes) * multiplicity,
                          computeProduct(originalBasis), errorMsg);
      if (succeeded(maybePredicateOps))
        predicateOps = *maybePredicateOps;
    }

    return IdBuilderResult{/*errorMsg=*/errorMsg,
                           /*mappingIdOps=*/ids,
                           /*predicateOps=*/predicateOps};
  };

  return res;
}

/// Create a simple 3-D id builder that takes the `originalBasisOfr`.
/// The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes
/// + ids for predicate generation.
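///
/// With multiplicity = warpSize, for instance, the x id is floor-divided by
/// the multiplicity so that 32 consecutive threads along x share the same
/// mapped id, while predication still happens in the unscaled thread basis.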
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    IndexType indexType = rewriter.getIndexType();
    SmallVector<Value> ids{
        ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::x),
        ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::y),
        ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::z)};
    // In the 3-D mapping case, scale the first dimension by the multiplicity.
    SmallVector<Value> scaledIds = ids;
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]}));
    // In the 3-D mapping case, scale the first forall size back into the
    // original (unscaled) basis by the multiplicity.
    SmallVector<int64_t> forallMappingSizeInOriginalBasis(forallMappingSizes);
    forallMappingSizeInOriginalBasis[0] *= multiplicity;

    std::string errorMsg;
    SmallVector<Value> predicateOps;
    FailureOr<SmallVector<Value>> maybePredicateOps =
        buildPredicates(rewriter, loc, ids, forallMappingSizeInOriginalBasis,
                        originalBasis, errorMsg);
    if (succeeded(maybePredicateOps))
      predicateOps = *maybePredicateOps;

    return IdBuilderResult{/*errorMsg=*/errorMsg,
                           /*mappingIdOps=*/scaledIds,
                           /*predicateOps=*/predicateOps};
  };
  return res;
}

/// Create a lane id builder that takes the `originalBasis` and decomposes
/// it in the basis of `forallMappingSizes`. The lane id builder returns an
/// n-D vector of ids for indexing and a 1-D size + id for predicate
/// generation.
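///
/// E.g. with warpSize = 32, physical linear thread id 70 yields lane
/// 70 % 32 = 6, and the lane ids are then delinearized in the basis of
/// `forallMappingSizes` exactly as in the linear builder above.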
static GpuIdBuilderFnType laneIdBuilderFn(int64_t warpSize) {
  auto res = [warpSize](RewriterBase &rewriter, Location loc,
                        ArrayRef<int64_t> forallMappingSizes,
                        ArrayRef<int64_t> originalBasis) {
    // 1. Compute linearId.
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    Value physicalLinearId =
        buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);

    // 2. Compute laneId.
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    OpFoldResult laneId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0 % warpSize, {physicalLinearId});

    // 3. Compute remapped indices.
    SmallVector<Value> ids;
    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
    // "row-major" order.
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    // Reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {laneId}));
    }

    // 4. Handle predicates using laneId.
    std::string errorMsg;
    SmallVector<Value> predicateOps;
    FailureOr<SmallVector<Value>> maybePredicateOps = buildPredicates(
        rewriter, loc, cast<Value>(laneId), computeProduct(forallMappingSizes),
        computeProduct(originalBasis), errorMsg);
    if (succeeded(maybePredicateOps))
      predicateOps = *maybePredicateOps;

    return IdBuilderResult{/*errorMsg=*/errorMsg,
                           /*mappingIdOps=*/ids,
                           /*predicateOps=*/predicateOps};
  };

  return res;
}

namespace mlir {
namespace transform {
namespace gpu {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
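  // Linear mapping targets the LinearDim0..LinearDimN attributes; 3-D mapping
  // targets DimX..DimZ.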
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}

GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                                     DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUBlockMappingAttr::get(ctx, id);
      }) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<BlockIdOp>(/*multiplicity=*/1, mask)
                  : common3DIdBuilderFn<BlockIdOp>(/*multiplicity=*/1);
}

GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                             bool useLinearMapping,
                                             DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpgroupMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
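  // A warpgroup spans kNumWarpsPerGroup * warpSize contiguous threads, e.g.
  // 4 * 32 = 128 threads for the typical NVIDIA configuration.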
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize, mask)
                  : common3DIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping,
                                   DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  assert((!mask || useLinearMapping) && "mask requires linear mapping");
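  // Each warp covers `warpSize` contiguous threads; the linear thread id is
  // floor-divided by `warpSize` (the multiplicity) to obtain the warp id.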
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/warpSize, mask)
                  : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                                       DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUThreadMappingAttr::get(ctx, id);
      }) {
  idBuilder =
      useLinearMapping
          ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1, mask)
          : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
}

GpuLaneIdBuilder::GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool unused, DeviceMaskingAttrInterface mask)
    : GpuIdBuilder(ctx, /*useLinearMapping=*/true,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPULaneMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  assert(!mask && "mask NYI for lanes; unclear whether it should be at all");
  idBuilder = laneIdBuilderFn(/*warpSize=*/warpSize);
}

DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {

  // TODO: pass a configuration object to set the limits properly.
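  // E.g., a block of (1024, 2, 1) has 2048 threads, which exceeds the
  // 1024-threads-per-block limit (kMaxTotalBlockdim) of current NVIDIA GPUs.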

  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          kMaxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          kMaxTotalGriddim ||
      blockDimX.value_or(1) > kMaxBlockdimx ||
      blockDimY.value_or(1) > kMaxBlockdimy ||
      blockDimZ.value_or(1) > kMaxBlockdimz ||
      gridDimY.value_or(1) > kMaxGriddimy ||
      gridDimZ.value_or(1) > kMaxGriddimz ||
      gridDimX.value_or(1) > kMaxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}

/// Create an empty-body gpu::LaunchOp using the provided kernel settings and
/// put a terminator within.
DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return arith::ConstantIndexOp::create(rewriter, loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = LaunchOp::create(rewriter, loc, gridSizeX, gridSizeY, gridSizeZ,
                              blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  TerminatorOp::create(rewriter, loc);
  return DiagnosedSilenceableFailure::success();
}

/// Alter the kernel configuration of the given kernel launch.
DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return arith::ConstantIndexOp::create(rewriter, currentBlockdim.x.getLoc(),
                                          dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}

} // namespace gpu
} // namespace transform
} // namespace mlir