MLIR 22.0.0git
Utils.cpp
Go to the documentation of this file.
1//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10
20#include "mlir/IR/AffineExpr.h"
21#include "mlir/IR/Builders.h"
23#include "mlir/IR/MLIRContext.h"
25#include "mlir/IR/Value.h"
26#include "mlir/IR/Visitors.h"
27#include "mlir/Support/LLVM.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include "llvm/Support/DebugLog.h"
31#include "llvm/Support/InterleavedRange.h"
32
33using namespace mlir;
34using namespace mlir::gpu;
35using namespace mlir::transform;
36using namespace mlir::transform::gpu;
37
38#define DEBUG_TYPE "gpu-transforms"
39
40/// Build predicates to filter execution by only the activeIds. Along each
41/// dimension, 3 cases appear:
42/// 1. activeMappingSize > availableMappingSize: this is an unsupported case
43/// as this requires additional looping. An error message is produced to
44/// advise the user to tile more or to use more threads.
45/// 2. activeMappingSize == availableMappingSize: no predication is needed.
46/// 3. activeMappingSize < availableMappingSize: only a subset of threads
47/// should be active and we produce the boolean `id < activeMappingSize`
48/// for further use in building predicated execution.
49static FailureOr<SmallVector<Value>>
51 ArrayRef<int64_t> activeMappingSizes,
52 ArrayRef<int64_t> availableMappingSizes,
53 std::string &errorMsg) {
54 LDBG() << "----activeMappingSizes: " << llvm::interleaved(activeMappingSizes);
55 LDBG() << "----availableMappingSizes: "
56 << llvm::interleaved(availableMappingSizes);
57
58 SmallVector<Value> predicateOps;
59 for (auto [activeId, activeMappingSize, availableMappingSize] :
60 llvm::zip_equal(activeIds, activeMappingSizes, availableMappingSizes)) {
61 if (activeMappingSize > availableMappingSize) {
62 errorMsg = "Trying to map to fewer GPU threads than loop iterations but "
63 "overprovisioning is not yet supported. Try additional tiling "
64 "before mapping or map to more threads.";
65 return failure();
66 }
67 if (activeMappingSize == availableMappingSize)
68 continue;
69 Value idx =
70 arith::ConstantIndexOp::create(rewriter, loc, activeMappingSize);
71 Value pred = arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::ult,
72 activeId, idx);
73 predicateOps.push_back(pred);
74 }
75 return predicateOps;
76}
77
78/// Return a flattened thread id for the workgroup with given sizes.
79template <typename ThreadOrBlockIdOp>
81 ArrayRef<OpFoldResult> originalBasisOfr) {
82 LDBG() << "----buildLinearId with originalBasisOfr: "
83 << llvm::interleaved(originalBasisOfr);
84 assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
85 IndexType indexType = rewriter.getIndexType();
86 AffineExpr tx, ty, tz, bdx, bdy;
87 bindDims(rewriter.getContext(), tx, ty, tz);
88 bindSymbols(rewriter.getContext(), bdx, bdy);
90 ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::x)
91 .getResult(),
92 ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::y)
93 .getResult(),
94 ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::z)
95 .getResult(),
96 originalBasisOfr[0], originalBasisOfr[1]};
98 rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
99 return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
100}
101
102/// Create a linear id builder that takes the `originalBasisOfr` and decompose
103/// it in the basis of `forallMappingSizes`. The linear id builder returns an
104/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
105template <typename ThreadOrBlockIdOp>
108 DeviceMaskingAttrInterface mask = nullptr) {
109 auto res = [multiplicity, mask](RewriterBase &rewriter, Location loc,
110 ArrayRef<int64_t> forallMappingSizes,
111 ArrayRef<int64_t> originalBasis) {
112 // 0. Early-exit mask case.
113 if (mask) {
114 if (computeProduct(originalBasis) >
115 mask.getMaxNumPhysicalIds() * multiplicity) {
116 return IdBuilderResult{
117 /*errorMsg=*/std::string(
118 "mask representation too short to capture all physical ids: ") +
119 std::to_string(mask.getMaxNumPhysicalIds()),
120 /*mappingIdOps=*/{},
121 /*predicateOps=*/{}};
122 }
123 }
124
125 // 1. Compute linearId.
126 SmallVector<OpFoldResult> originalBasisOfr =
127 getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
128 Value physicalLinearId =
129 buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
130
131 // 2. Compute scaledLinearId.
132 AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
134 rewriter, loc, d0.floorDiv(multiplicity), {physicalLinearId});
135
136 // 2.b. Adjust with mask if needed.
137 Value scaledLinearIdI64;
138 Value scaledLinearId =
139 getValueOrCreateConstantIndexOp(rewriter, loc, scaledLinearIdOfr);
140 if (mask) {
141 scaledLinearId =
142 getValueOrCreateConstantIndexOp(rewriter, loc, scaledLinearIdOfr);
143 scaledLinearIdI64 = arith::IndexCastUIOp::create(
144 rewriter, loc, rewriter.getI64Type(), scaledLinearId);
145 Value logicalLinearIdI64 =
146 mask.createLogicalLinearMappingId(rewriter, scaledLinearIdI64);
147 scaledLinearId = arith::IndexCastUIOp::create(
148 rewriter, loc, rewriter.getIndexType(), logicalLinearIdI64);
149 LDBG() << "------adjusting linearId with mask: " << scaledLinearId;
150 }
151
152 // 3. Compute remapped indices.
154 // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
155 // "row-major" order.
156 SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
157 SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
158 SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
159 // Reverse back to be in [0 .. n] order.
160 for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
161 ids.push_back(
162 affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
163 }
164
165 std::string errorMsg;
166 SmallVector<Value> predicateOps;
167 // 4. If mask present, it takes precedence to determine predication.
168 if (mask) {
169 Value isActiveIdPredicate =
170 mask.createIsActiveIdPredicate(rewriter, scaledLinearIdI64);
171 LDBG() << "------adjusting predicate with mask: " << isActiveIdPredicate;
172 predicateOps.push_back(isActiveIdPredicate);
173 } else {
174 // 4.b. Otherwise, handle predicates using physicalLinearId.
175 FailureOr<SmallVector<Value>> maybePredicateOps =
176 buildPredicates(rewriter, loc, physicalLinearId,
177 computeProduct(forallMappingSizes) * multiplicity,
178 computeProduct(originalBasis), errorMsg);
179 if (succeeded(maybePredicateOps))
180 predicateOps = *maybePredicateOps;
181 }
182
183 return IdBuilderResult{/*errorMsg=*/errorMsg,
184 /*mappingIdOps=*/ids,
185 /*predicateOps=*/predicateOps};
186 };
187
188 return res;
189}
190
191/// Create a simple 3-D id builder that takes the `originalBasisOfr`
192/// The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes
193/// + ids for predicate generation.
194template <typename ThreadOrBlockIdOp>
196 auto res = [multiplicity](RewriterBase &rewriter, Location loc,
197 ArrayRef<int64_t> forallMappingSizes,
198 ArrayRef<int64_t> originalBasis) {
199 IndexType indexType = rewriter.getIndexType();
201 ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::x),
202 ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::y),
203 ThreadOrBlockIdOp::create(rewriter, loc, indexType, Dimension::z)};
204 // In the 3-D mapping case, scale the first dimension by the multiplicity.
205 SmallVector<Value> scaledIds = ids;
206 AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
207 scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply(
208 rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]}));
209 // In the 3-D mapping case, unscale the first dimension by the multiplicity.
210 SmallVector<int64_t> forallMappingSizeInOriginalBasis(forallMappingSizes);
211 forallMappingSizeInOriginalBasis[0] *= multiplicity;
212
213 std::string errorMsg;
214 SmallVector<Value> predicateOps;
215 FailureOr<SmallVector<Value>> maybePredicateOps =
216 buildPredicates(rewriter, loc, ids, forallMappingSizeInOriginalBasis,
217 originalBasis, errorMsg);
218 if (succeeded(maybePredicateOps))
219 predicateOps = *maybePredicateOps;
220
221 return IdBuilderResult{/*errorMsg=*/errorMsg,
222 /*mappingIdOps=*/scaledIds,
223 /*predicateOps=*/predicateOps};
224 };
225 return res;
226}
227
228/// Create a lane id builder that takes the `originalBasis` and decompose
229/// it in the basis of `forallMappingSizes`. The linear id builder returns an
230/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
232 auto res = [warpSize](RewriterBase &rewriter, Location loc,
233 ArrayRef<int64_t> forallMappingSizes,
234 ArrayRef<int64_t> originalBasis) {
235 // 1. Compute linearId.
236 SmallVector<OpFoldResult> originalBasisOfr =
237 getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
238 Value physicalLinearId =
239 buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);
240
241 // 2. Compute laneId.
242 AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
244 rewriter, loc, d0 % warpSize, {physicalLinearId});
245
246 // 3. Compute remapped indices.
248 // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
249 // "row-major" order.
250 SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
251 SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
252 SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
253 // Reverse back to be in [0 .. n] order.
254 for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
255 ids.push_back(
256 affine::makeComposedAffineApply(rewriter, loc, e, {laneId}));
257 }
258
259 // 4. Handle predicates using laneId.
260 std::string errorMsg;
261 SmallVector<Value> predicateOps;
262 FailureOr<SmallVector<Value>> maybePredicateOps = buildPredicates(
263 rewriter, loc, cast<Value>(laneId), computeProduct(forallMappingSizes),
264 computeProduct(originalBasis), errorMsg);
265 if (succeeded(maybePredicateOps))
266 predicateOps = *maybePredicateOps;
267
268 return IdBuilderResult{/*errorMsg=*/errorMsg,
269 /*mappingIdOps=*/ids,
270 /*predicateOps=*/predicateOps};
271 };
272
273 return res;
274}
275
276namespace mlir {
277namespace transform {
278namespace gpu {
279
280GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
281 const MappingIdBuilderFnType &fn)
283 if (useLinearMapping) {
284 for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
285 e = getMaxEnumValForMappingId();
286 d <= e; ++d)
287 mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
288 } else {
289 for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
290 e = static_cast<uint64_t>(MappingId::DimZ);
291 d <= e; ++d)
292 mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
293 }
294}
295
297 DeviceMaskingAttrInterface mask)
298 : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
299 return GPUBlockMappingAttr::get(ctx, id);
300 }) {
301 assert((!mask || useLinearMapping) && "mask requires linear mapping");
302 idBuilder = useLinearMapping
303 ? commonLinearIdBuilderFn<BlockIdOp>(/*multiplicity=*/1, mask)
304 : common3DIdBuilderFn<BlockIdOp>(/*multiplicity=*/1);
305}
306
308 bool useLinearMapping,
309 DeviceMaskingAttrInterface mask)
310 : GpuIdBuilder(ctx, useLinearMapping,
311 [](MLIRContext *ctx, MappingId id) {
312 return GPUWarpgroupMappingAttr::get(ctx, id);
313 }),
314 warpSize(warpSize) {
315 assert((!mask || useLinearMapping) && "mask requires linear mapping");
316 idBuilder = useLinearMapping
318 /*multiplicity=*/kNumWarpsPerGroup * warpSize, mask)
319 : common3DIdBuilderFn<ThreadIdOp>(
320 /*multiplicity=*/kNumWarpsPerGroup * warpSize);
321}
322
324 bool useLinearMapping,
325 DeviceMaskingAttrInterface mask)
326 : GpuIdBuilder(ctx, useLinearMapping,
327 [](MLIRContext *ctx, MappingId id) {
328 return GPUWarpMappingAttr::get(ctx, id);
329 }),
330 warpSize(warpSize) {
331 assert((!mask || useLinearMapping) && "mask requires linear mapping");
332 idBuilder = useLinearMapping
334 /*multiplicity=*/warpSize, mask)
335 : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
336}
337
339 DeviceMaskingAttrInterface mask)
340 : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
341 return GPUThreadMappingAttr::get(ctx, id);
342 }) {
343 idBuilder =
344 useLinearMapping
345 ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1, mask)
346 : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
347}
348
350 bool unused, DeviceMaskingAttrInterface mask)
351 : GpuIdBuilder(ctx, /*useLinearMapping=*/true,
352 [](MLIRContext *ctx, MappingId id) {
353 return GPULaneMappingAttr::get(ctx, id);
354 }),
355 warpSize(warpSize) {
356 assert(!mask && "mask NYI for lanes, unclear it should be at all");
357 idBuilder = laneIdBuilderFn(/*periodicity=*/warpSize);
358}
359
360DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
361 std::optional<int64_t> gridDimX,
362 std::optional<int64_t> gridDimY,
363 std::optional<int64_t> gridDimZ,
364 std::optional<int64_t> blockDimX,
365 std::optional<int64_t> blockDimY,
366 std::optional<int64_t> blockDimZ) {
367
368 // TODO: pass a configuration object to set the limits properly.
369
370 if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
372 (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
374 blockDimX.value_or(1) > kMaxBlockdimx ||
375 blockDimY.value_or(1) > kMaxBlockdimy ||
376 blockDimZ.value_or(1) > kMaxBlockdimz ||
377 gridDimY.value_or(1) > kMaxGriddimy ||
378 gridDimZ.value_or(1) > kMaxGriddimz ||
379 gridDimX.value_or(1) > kMaxGriddimx) {
380 return transformOp.emitSilenceableError()
381 << "Trying to launch a GPU kernel with grid_dims = ("
382 << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
383 << gridDimZ.value_or(1) << ") block_dims = ("
384 << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
385 << blockDimZ.value_or(1) << "). It is larger than the limits.";
386 }
388}
389
391 RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
392 LaunchOp &launchOp, std::optional<int64_t> gridDimX,
393 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
394 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
395 std::optional<int64_t> blockDimZ) {
397 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
398 blockDimY, blockDimZ);
399 if (!diag.succeeded())
400 return diag;
401
402 auto createConst = [&](int dim) {
403 return arith::ConstantIndexOp::create(rewriter, loc, dim);
404 };
405 OpBuilder::InsertionGuard guard(rewriter);
406 Value one = createConst(1);
407 Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
408 Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
409 Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
410 Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
411 Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
412 Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
413 launchOp = LaunchOp::create(rewriter, loc, gridSizeX, gridSizeY, gridSizeZ,
414 blkSizeX, blkSizeY, blkSizeZ);
415 rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
416 TerminatorOp::create(rewriter, loc);
418}
419
420/// Alter kernel configuration of the given kernel.
422 RewriterBase &rewriter, LaunchOp gpuLaunch,
423 TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
424 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
425 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
426 std::optional<int64_t> blockDimZ) {
428 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
429 blockDimY, blockDimZ);
430 if (!diag.succeeded())
431 return diag;
432
433 KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
434 OpBuilder::InsertionGuard guard(rewriter);
435 rewriter.setInsertionPointAfterValue(currentBlockdim.x);
436 auto createConstValue = [&](int dim) {
437 return arith::ConstantIndexOp::create(rewriter, currentBlockdim.x.getLoc(),
438 dim);
439 };
440
441 if (gridDimX.has_value())
442 gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
443 if (gridDimY.has_value())
444 gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
445 if (gridDimZ.has_value())
446 gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
447 if (blockDimX.has_value())
448 gpuLaunch.getBlockSizeXMutable().assign(
449 createConstValue(blockDimX.value()));
450 if (blockDimY.has_value())
451 gpuLaunch.getBlockSizeYMutable().assign(
452 createConstValue(blockDimY.value()));
453 if (blockDimZ.has_value())
454 gpuLaunch.getBlockSizeZMutable().assign(
455 createConstValue(blockDimZ.value()));
457}
458
459} // namespace gpu
460} // namespace transform
461} // namespace mlir
static Value createConst(Location loc, Type type, int value, PatternRewriter &rewriter)
Create an integer or index constant.
Definition ExpandOps.cpp:27
static FailureOr< SmallVector< Value > > buildPredicates(RewriterBase &rewriter, Location loc, ArrayRef< Value > activeIds, ArrayRef< int64_t > activeMappingSizes, ArrayRef< int64_t > availableMappingSizes, std::string &errorMsg)
Build predicates to filter execution by only the activeIds.
Definition Utils.cpp:50
static Value buildLinearId(RewriterBase &rewriter, Location loc, ArrayRef< OpFoldResult > originalBasisOfr)
Return a flattened thread id for the workgroup with given sizes.
Definition Utils.cpp:80
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity=1)
Create a simple 3-D id builder that takes the originalBasisOfr The 3-D id builder returns a 3-D vecto...
Definition Utils.cpp:195
static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity=1, DeviceMaskingAttrInterface mask=nullptr)
Create a linear id builder that takes the originalBasisOfr and decompose it in the basis of forallMap...
Definition Utils.cpp:107
static GpuIdBuilderFnType laneIdBuilderFn(int64_t warpSize)
Create a lane id builder that takes the originalBasis and decompose it in the basis of forallMappingS...
Definition Utils.cpp:231
true
Given two iterators into the same block, return "true" if `a` is before `b`.
static std::string diag(const llvm::Value &value)
constexpr int kMaxGriddimz
constexpr int kMaxTotalBlockdim
constexpr int kMaxGriddimy
constexpr int kMaxBlockdimx
constexpr int kMaxBlockdimz
constexpr int kMaxGriddimx
constexpr int kMaxBlockdimy
constexpr int kMaxTotalGriddim
Base type for affine expression.
Definition AffineExpr.h:68
AffineExpr floorDiv(uint64_t v) const
MLIRContext * getContext() const
Definition Builders.h:56
IndexType getIndexType()
Definition Builders.cpp:51
The result of a transform IR operation application.
static DiagnosedSilenceableFailure success()
Constructs a DiagnosedSilenceableFailure in the success state.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
RAII guard to reset the insertion point of the builder when destroyed.
Definition Builders.h:348
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition Builders.h:436
void setInsertionPointAfterValue(Value val)
Sets the insertion point to the node after the specified value.
Definition Builders.h:421
This class represents a single result from folding an operation.
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition Value.h:96
Location getLoc() const
Return the location of this value.
Definition Value.cpp:24
static ConstantIndexOp create(OpBuilder &builder, Location location, int64_t value)
Definition ArithOps.cpp:359
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands, bool composeAffineMin=false)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
std::function< IdBuilderResult( RewriterBase &, Location, ArrayRef< int64_t >, ArrayRef< int64_t >)> GpuIdBuilderFnType
Common gpu id builder type, allows the configuration of lowering for various mapping schemes.
Definition Utils.h:55
DiagnosedSilenceableFailure alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch, TransformOpInterface transformOp, std::optional< int64_t > gridDimX=std::nullopt, std::optional< int64_t > gridDimY=std::nullopt, std::optional< int64_t > gridDimZ=std::nullopt, std::optional< int64_t > blockDimX=std::nullopt, std::optional< int64_t > blockDimY=std::nullopt, std::optional< int64_t > blockDimZ=std::nullopt)
Alter kernel configuration of the given kernel.
DiagnosedSilenceableFailure createGpuLaunch(RewriterBase &rewriter, Location loc, TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp, std::optional< int64_t > gridDimX=std::nullopt, std::optional< int64_t > gridDimY=std::nullopt, std::optional< int64_t > gridDimZ=std::nullopt, std::optional< int64_t > blockDimX=std::nullopt, std::optional< int64_t > blockDimY=std::nullopt, std::optional< int64_t > blockDimZ=std::nullopt)
Create an empty-body gpu::LaunchOp using the provided kernel settings and put a terminator within.
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, std::optional< int64_t > gridDimX, std::optional< int64_t > gridDimY, std::optional< int64_t > gridDimZ, std::optional< int64_t > blockDimX, std::optional< int64_t > blockDimY, std::optional< int64_t > blockDimZ)
Determine if the size of the kernel configuration is supported by the GPU architecture being used.
Definition Utils.cpp:360
Include the generated interface declarations.
OpFoldResult getAsIndexOpFoldResult(MLIRContext *ctx, int64_t val)
Convert int64_t to integer attributes of index type and return them as OpFoldResult.
SmallVector< int64_t > computeStrides(ArrayRef< int64_t > sizes)
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .
Definition AffineExpr.h:311
SmallVector< int64_t > delinearize(int64_t linearIndex, ArrayRef< int64_t > strides)
Given the strides together with a linear index in the dimension space, return the vector-space offset...
int64_t computeProduct(ArrayRef< int64_t > basis)
Self-explicit.
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .
Definition AffineExpr.h:325
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition Utils.cpp:111
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
Utility class for the GPU dialect to represent triples of Values accessible through ....
Definition GPUDialect.h:39
GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping=false, DeviceMaskingAttrInterface mask=nullptr)
Definition Utils.cpp:296
std::function< DeviceMappingAttrInterface( MLIRContext *, mlir::gpu::MappingId)> MappingIdBuilderFnType
Definition Utils.h:61
SmallVector< DeviceMappingAttrInterface > mappingAttributes
The mapping attributes targeted by this generator.
Definition Utils.h:69
GpuIdBuilderFnType idBuilder
The constructor that builds the concrete IR for mapping ids.
Definition Utils.h:72
GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize, bool unused, DeviceMaskingAttrInterface mask=nullptr)
Definition Utils.cpp:349
GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping=false, DeviceMaskingAttrInterface mask=nullptr)
Definition Utils.cpp:338
GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize, bool useLinearMapping=false, DeviceMaskingAttrInterface mask=nullptr)
Definition Utils.cpp:323
GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize, bool useLinearMapping=false, DeviceMaskingAttrInterface mask=nullptr)
Definition Utils.cpp:307
Helper type for functions that generate ids for the mapping of a scf.forall.
Definition Utils.h:31