//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/Utils.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"

using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")

/// Return a flattened thread id for the workgroup with given sizes.
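/// The flattened id is built as id.x + id.y * bdx + id.z * bdx * bdy, where
/// `id` is the per-dimension thread (or block) id and bdx, bdy are the first
/// two entries of `originalBasisOfr`.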
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LLVM_DEBUG(llvm::interleaveComma(
                 originalBasisOfr,
                 DBGS() << "----buildLinearId with originalBasisOfr: ");
             llvm::dbgs() << "\n");
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, bdx, bdy;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), bdx, bdy);
  SmallVector<OpFoldResult> vals{
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}

/// Create a linear id builder that takes the `originalBasisOfr` and
/// decomposes it in the basis of `forallMappingSizes`. The linear id builder
/// returns an n-D vector of ids for indexing and a 1-D size + id for
/// predicate generation.
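///
/// For example (illustrative values, not from this file): with an original
/// basis of {32, 2, 1}, forallMappingSizes = {4, 8} and multiplicity = 1, the
/// builder delinearizes the flattened id into two ids for indexing and
/// reports availableMappingSizes = {64}, activeMappingSizes = {32} and the
/// flattened id itself, so that callers can predicate out the extra threads.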
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    OpFoldResult linearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
    // "row-major" order.
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    OpFoldResult scaledLinearId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {linearId});
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    SmallVector<Value> ids;
    // Reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }

    // clang-format off
    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
                                     DBGS() << "--delinearization basis: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(strides,
                                     DBGS() << "--delinearization strides: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(delinearizingExprs,
                                     DBGS() << "--delinearization exprs: ");
               llvm::dbgs() << "\n";
               llvm::interleaveComma(ids, DBGS() << "--ids: ");
               llvm::dbgs() << "\n";);
    // clang-format on

    // Return n-D ids for indexing and 1-D size + id for predicate generation.
    return IdBuilderResult{
        /*mappingIdOps=*/ids,
        /*availableMappingSizes=*/
        SmallVector<int64_t>{computeProduct(originalBasis)},
        // `forallMappingSizes` iterate in the scaled basis, they need to be
        // scaled back into the original basis to provide tight
        // activeMappingSizes quantities for predication.
        /*activeMappingSizes=*/
        SmallVector<int64_t>{computeProduct(forallMappingSizes) *
                             multiplicity},
        /*activeIdOps=*/SmallVector<Value>{cast<Value>(linearId)}};
  };

  return res;
}

/// Create a simple 3-D id builder that takes the `originalBasisOfr`. The 3-D
/// id builder returns a 3-D vector of ids for indexing and 3-D sizes + ids
/// for predicate generation.
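///
/// For example, GpuWarpIdBuilder instantiates this builder with
/// multiplicity = warpSize: the x id is divided by the multiplicity so that
/// it steps once per warp, and the active size along x is scaled back up by
/// the same factor for predication.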
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    IndexType indexType = rewriter.getIndexType();
    SmallVector<Value> ids{
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
    // In the 3-D mapping case, scale the first dimension by the multiplicity.
    SmallVector<Value> scaledIds = ids;
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]}));
    // In the 3-D mapping case, unscale the first dimension by the
    // multiplicity.
    SmallVector<int64_t> forallMappingSizeInOriginalBasis(forallMappingSizes);
    forallMappingSizeInOriginalBasis[0] *= multiplicity;
    return IdBuilderResult{
        /*mappingIdOps=*/scaledIds,
        /*availableMappingSizes=*/SmallVector<int64_t>{originalBasis},
        // `forallMappingSizes` iterate in the scaled basis, they need to be
        // scaled back into the original basis to provide tight
        // activeMappingSizes quantities for predication.
        /*activeMappingSizes=*/
        SmallVector<int64_t>{forallMappingSizeInOriginalBasis},
        /*activeIdOps=*/ids};
  };
  return res;
}

namespace mlir {
namespace transform {
namespace gpu {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}

GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUBlockMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<BlockIdOp>(/*multiplicity=*/1)
                  : common3DIdBuilderFn<BlockIdOp>(/*multiplicity=*/1);
}

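// Warpgroup-level mapping: ids step once per group of kNumWarpsPerGroup warps
// (e.g. with kNumWarpsPerGroup == 4 and warpSize == 32, once every 128
// threads).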
GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                             bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpgroupMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize)
                  : common3DIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  idBuilder =
      useLinearMapping
          ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize)
          : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUThreadMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1)
                  : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
}

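/// Determine if the size of the kernel configuration is supported by the GPU
/// architecture being used.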
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {

  // TODO: pass a configuration object to set the limits properly.

  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          kMaxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          kMaxTotalGriddim ||
      blockDimX.value_or(1) > kMaxBlockdimx ||
      blockDimY.value_or(1) > kMaxBlockdimy ||
      blockDimZ.value_or(1) > kMaxBlockdimz ||
      gridDimY.value_or(1) > kMaxGriddimy ||
      gridDimZ.value_or(1) > kMaxGriddimz ||
      gridDimX.value_or(1) > kMaxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}

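/// Create an empty-body gpu::LaunchOp using the provided kernel settings and
/// put a terminator within.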
DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}

/// Alter kernel configuration of the given kernel.
DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}

} // namespace gpu
} // namespace transform
} // namespace mlir