MLIR  16.0.0git
VectorToGPU.cpp
1 //===- VectorToGPU.cpp - Convert vector to GPU dialect ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements lowering of vector operations to GPU dialect ops.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <type_traits>
14 
15 #include "NvGpuSupport.h"
17 
18 #include "../PassDetail.h"
28 #include "mlir/IR/Builders.h"
29 #include "mlir/Pass/Pass.h"
31 #include "mlir/Transforms/Passes.h"
32 #include "llvm/ADT/TypeSwitch.h"
33 
34 using namespace mlir;
35 
36 /// For a vector TransferOpType `xferOp`, an empty `indices` vector, and an
37 /// AffineMap representing offsets to apply to indices, the function fills
38 /// `indices` with the original indices plus the offsets. The offsets are
39 /// applied by taking into account the permutation map of the transfer op. If
40 /// the `offsetMap` has dimension placeholders, those should be provided in
41 /// `dimValues`.
42 template <typename TransferOpType>
43 static void getXferIndices(OpBuilder &b, TransferOpType xferOp,
44  AffineMap offsetMap, ArrayRef<Value> dimValues,
45  SmallVector<Value, 4> &indices) {
46  indices.append(xferOp.getIndices().begin(), xferOp.getIndices().end());
47  Location loc = xferOp.getLoc();
48  unsigned offsetsIdx = 0;
49  for (auto expr : xferOp.getPermutationMap().getResults()) {
50  if (auto dim = expr.template dyn_cast<AffineDimExpr>()) {
51  Value prevIdx = indices[dim.getPosition()];
52  SmallVector<Value, 3> dims(dimValues.begin(), dimValues.end());
53  dims.push_back(prevIdx);
54  AffineExpr d0 = b.getAffineDimExpr(offsetMap.getNumDims());
55  indices[dim.getPosition()] = makeComposedAffineApply(
56  b, loc, d0 + offsetMap.getResult(offsetsIdx++), dims);
57  continue;
58  }
59  }
60 }
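// Illustrative sketch (hypothetical values): with original indices [%i, %j],
// an identity permutation map, offsetMap = (d0) -> (d0 floordiv 4, d0 mod 4),
// and dimValues = {%laneId}, the rewritten indices are, roughly,
//   [%i + %laneId floordiv 4, %j + %laneId mod 4]
// materialized through composed affine.apply ops.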
61 
62 // Return true if the contract op can be converted to an MMA matmul.
63 static bool contractSupportsMMAMatrixType(vector::ContractionOp contract,
64  bool useNvGpu) {
65  if (llvm::size(contract.getMasks()) != 0)
66  return false;
67 
68  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
69  auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
70  AffineExpr m, n, k;
71  bindDims(contract.getContext(), m, n, k);
72  auto iteratorTypes = contract.getIteratorTypes().getValue();
73  if (!(isParallelIterator(iteratorTypes[0]) &&
74  isParallelIterator(iteratorTypes[1]) &&
75  isReductionIterator(iteratorTypes[2])))
76  return false;
77 
78  // The contract needs to represent a matmul to be able to convert to
79  // MMAMatrix matmul.
80  if (!useNvGpu &&
81  contract.getIndexingMapsArray() != infer({{m, k}, {k, n}, {m, n}}))
82  return false;
83  if (useNvGpu &&
84  contract.getIndexingMapsArray() != infer({{m, k}, {n, k}, {m, n}}))
85  return false;
86 
87  return true;
88 }
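// For reference, the two accepted forms written as indexing maps (an
// illustrative reading of the checks above, not additional constraints):
//   wmma path (useNvGpu == false):
//     A: (m, n, k) -> (m, k), B: (m, n, k) -> (k, n), C: (m, n, k) -> (m, n)
//   mma.sync path (useNvGpu == true):
//     A: (m, n, k) -> (m, k), B: (m, n, k) -> (n, k), C: (m, n, k) -> (m, n)
// i.e. the mma.sync path expects the B operand with its reduction dimension
// innermost.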
89 
90 // Return the constant stride of the second-to-last dimension of |type| if it
91 // is a memref with a static, unit-innermost-stride layout; llvm::None otherwise.
92 static llvm::Optional<int64_t>
93 getMemrefConstantHorizontalStride(ShapedType type) {
94  auto memrefType = type.dyn_cast<MemRefType>();
95  if (!memrefType)
96  return llvm::None;
97  // If the memref is 0 or 1D the horizontal stride is 0.
98  if (memrefType.getRank() < 2)
99  return 0;
100  int64_t offset = 0;
101  SmallVector<int64_t, 2> strides;
102  if (failed(getStridesAndOffset(memrefType, strides, offset)) ||
103  strides.back() != 1)
104  return llvm::None;
105  int64_t stride = strides[strides.size() - 2];
106  if (stride == ShapedType::kDynamicStrideOrOffset)
107  return llvm::None;
108  return stride;
109 }
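// For example (illustrative): a memref<16x32xf16> with the default identity
// layout has strides [32, 1], so 32 is returned; a layout whose second-to-last
// stride is dynamic, or whose innermost stride is not 1, yields llvm::None.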
110 
111 // Return true if the transfer op can be converted to a MMA matrix load.
112 static bool transferReadSupportsMMAMatrixType(vector::TransferReadOp readOp,
113  bool useNvGpu) {
114  if (readOp.getMask() || readOp.hasOutOfBoundsDim() ||
115  readOp.getVectorType().getRank() != 2)
116  return false;
117  if (!getMemrefConstantHorizontalStride(readOp.getShapedType()))
118  return false;
119  AffineMap map = readOp.getPermutationMap();
120  OpBuilder b(readOp.getContext());
121  AffineExpr innerDim = b.getAffineDimExpr(map.getNumDims() - 1);
122  AffineExpr zero = b.getAffineConstantExpr(0);
123  auto broadcastInnerDim = AffineMap::get(map.getNumDims(), 0, {zero, innerDim},
124  readOp.getContext());
125 
126  if (!useNvGpu) {
127  // TODO: Support transpose once it is added to GPU dialect ops.
128  // For now we only support (d0, d1) -> (d0, d1) and (d0, d1) -> (0, d1).
129  return map.isMinorIdentity() || map == broadcastInnerDim;
130  }
131 
132  return true;
133 }
134 
135 // Return true if the transfer op can be converted to a MMA matrix store.
136 static bool
137 transferWriteSupportsMMAMatrixType(vector::TransferWriteOp writeOp) {
138  // TODO: support 0-d corner case.
139  if (writeOp.getTransferRank() == 0)
140  return false;
141 
142  if (writeOp.getMask() || writeOp.hasOutOfBoundsDim() ||
143  writeOp.getVectorType().getRank() != 2)
144  return false;
145  if (!getMemrefConstantHorizontalStride(writeOp.getShapedType()))
146  return false;
147  // TODO: Support transpose once it is added to GPU dialect ops.
148  if (!writeOp.getPermutationMap().isMinorIdentity())
149  return false;
150  return true;
151 }
152 
153 /// Return true if the constant is a splat to a 2D vector so that it can be
154 /// converted to a MMA constant matrix op.
155 static bool constantSupportsMMAMatrixType(arith::ConstantOp constantOp) {
156  auto vecType = constantOp.getType().dyn_cast<VectorType>();
157  if (!vecType || vecType.getRank() != 2)
158  return false;
159  return constantOp.getValue().isa<SplatElementsAttr>();
160 }
161 
162 /// Return true if this is a broadcast from scalar to a 2D vector.
163 static bool broadcastSupportsMMAMatrixType(vector::BroadcastOp broadcastOp) {
164  return broadcastOp.getVectorType().getRank() == 2 &&
165  broadcastOp.getSource().getType().isa<FloatType>();
166 }
167 
168 /// Return the MMA elementwise enum associated with `op` if it is supported.
169 /// Return `llvm::None` otherwise.
170 static llvm::Optional<gpu::MMAElementwiseOp>
171 convertElementwiseOpToMMA(Operation *op) {
172  if (isa<arith::AddFOp>(op))
173  return gpu::MMAElementwiseOp::ADDF;
174  if (isa<arith::MulFOp>(op))
175  return gpu::MMAElementwiseOp::MULF;
176  if (isa<arith::MaxFOp>(op))
177  return gpu::MMAElementwiseOp::MAXF;
178  if (isa<arith::MinFOp>(op))
179  return gpu::MMAElementwiseOp::MINF;
180  if (isa<arith::DivFOp>(op))
181  return gpu::MMAElementwiseOp::DIVF;
182  return llvm::None;
183 }
184 
185 /// Return true if the op is supported as elementwise op on MMAMatrix type.
186 static bool elementwiseSupportsMMAMatrixType(Operation *op) {
187  return convertElementwiseOpToMMA(op).has_value();
188 }
189 
190 static bool supportsMMaMatrixType(Operation *op, bool useNvGpu) {
191  if (isa<scf::ForOp, scf::YieldOp>(op))
192  return true;
193  if (auto transferRead = dyn_cast<vector::TransferReadOp>(op))
194  return transferReadSupportsMMAMatrixType(transferRead, useNvGpu);
195  if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op))
196  return transferWriteSupportsMMAMatrixType(transferWrite);
197  if (auto contract = dyn_cast<vector::ContractionOp>(op))
198  return contractSupportsMMAMatrixType(contract, useNvGpu);
199  if (auto constant = dyn_cast<arith::ConstantOp>(op))
200  return constantSupportsMMAMatrixType(constant);
201  if (auto broadcast = dyn_cast<vector::BroadcastOp>(op))
202  return broadcastSupportsMMAMatrixType(broadcast);
203  return elementwiseSupportsMMAMatrixType(op);
204 }
205 
206 /// Return an unsorted slice handling scf.for region differently than
207 /// `getSlice`. In scf.for we only want to include as part of the slice elements
208 /// that are part of the use/def chain.
209 static SetVector<Operation *> getSliceContract(Operation *op,
210  TransitiveFilter backwardFilter,
211  TransitiveFilter forwardFilter) {
212  SetVector<Operation *> slice;
213  slice.insert(op);
214  unsigned currentIndex = 0;
215  SetVector<Operation *> backwardSlice;
216  SetVector<Operation *> forwardSlice;
217  while (currentIndex != slice.size()) {
218  auto *currentOp = (slice)[currentIndex];
219  // Compute and insert the backwardSlice starting from currentOp.
220  backwardSlice.clear();
221  getBackwardSlice(currentOp, &backwardSlice, backwardFilter);
222  slice.insert(backwardSlice.begin(), backwardSlice.end());
223 
224  // Compute and insert the forwardSlice starting from currentOp.
225  forwardSlice.clear();
226  // Special case for ForOp, we don't want to include the whole region but
227  // only the value using the region arguments.
228  // TODO: We should refine this to only care about the region arguments being
229  // converted to matrix type.
230  if (auto forOp = dyn_cast<scf::ForOp>(currentOp)) {
231  for (Value forOpResult : forOp.getResults())
232  getForwardSlice(forOpResult, &forwardSlice, forwardFilter);
233  for (BlockArgument &arg : forOp.getRegionIterArgs())
234  getForwardSlice(arg, &forwardSlice, forwardFilter);
235  } else {
236  getForwardSlice(currentOp, &forwardSlice, forwardFilter);
237  }
238  slice.insert(forwardSlice.begin(), forwardSlice.end());
239  ++currentIndex;
240  }
241  return slice;
242 }
243 
244 // Analyze slice of operations based on convert op to figure out if the whole
245 // slice can be converted to MMA operations.
246 static SetVector<Operation *> getOpToConvert(mlir::Operation *op,
247  bool useNvGpu) {
248  auto hasVectorDest = [](Operation *op) {
249  return llvm::any_of(op->getResultTypes(),
250  [](Type t) { return t.isa<VectorType>(); });
251  };
252  auto hasVectorSrc = [](Operation *op) {
253  return llvm::any_of(op->getOperandTypes(),
254  [](Type t) { return t.isa<VectorType>(); });
255  };
256  SetVector<Operation *> opToConvert;
257  op->walk([&](vector::ContractionOp contract) {
258  if (opToConvert.contains(contract.getOperation()))
259  return;
260  SetVector<Operation *> dependentOps =
261  getSliceContract(contract, hasVectorDest, hasVectorSrc);
262  // If any instruction in the chain cannot use the MMA matrix type, drop the
263  // whole chain. MMA matrices are stored in an opaque type, so they cannot be
264  // used by all operations.
265  if (llvm::any_of(dependentOps, [useNvGpu](Operation *op) {
266  return !supportsMMaMatrixType(op, useNvGpu);
267  }))
268  return;
269  opToConvert.insert(dependentOps.begin(), dependentOps.end());
270  });
271  // Sort the operations so that we can convert them in topological order.
272  return topologicalSort(opToConvert);
273 }
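// Illustrative example: for a chain such as
//   %a = vector.transfer_read ...    %b = vector.transfer_read ...
//   %c = vector.contract %a, %b, %acc ...
//   vector.transfer_write %c, ...
// the whole read/contract/write slice is collected together; if any op in that
// slice is rejected by supportsMMaMatrixType, none of it is converted.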
274 
275 namespace {
276 // Transform contract into (m, k)x(k, n)x(m, n) form so that it can be converted
277 // to MMA matmul.
278 struct PrepareContractToGPUMMA
279  : public OpRewritePattern<vector::ContractionOp> {
280  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
281 
282  LogicalResult matchAndRewrite(vector::ContractionOp op,
283  PatternRewriter &rewriter) const override {
284  Location loc = op.getLoc();
285  Value lhs = op.getLhs(), rhs = op.getRhs(), res = op.getAcc();
286 
287  // Set up the parallel/reduction structure in the right form.
288  using MapList = ArrayRef<ArrayRef<AffineExpr>>;
289  auto infer = [](MapList m) { return AffineMap::inferFromExprList(m); };
290  AffineExpr m, n, k;
291  bindDims(rewriter.getContext(), m, n, k);
292  static constexpr std::array<int64_t, 2> perm = {1, 0};
293  auto iteratorTypes = op.getIteratorTypes().getValue();
294  SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();
295  if (!(isParallelIterator(iteratorTypes[0]) &&
296  isParallelIterator(iteratorTypes[1]) &&
297  isReductionIterator(iteratorTypes[2])))
298  return failure();
299  //
300  // Two outer parallel, one inner reduction (matmat flavor).
301  //
302  if (maps == infer({{m, k}, {k, n}, {m, n}})) {
303  // This is the classical row-major matmul, nothing to do.
304  return failure();
305  }
306  if (maps == infer({{m, k}, {n, k}, {m, n}})) {
307  rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
308  } else if (maps == infer({{k, m}, {k, n}, {m, n}})) {
309  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
310  } else if (maps == infer({{k, m}, {n, k}, {m, n}})) {
311  rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
312  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
313  } else if (maps == infer({{m, k}, {k, n}, {n, m}})) {
314  std::swap(rhs, lhs);
315  rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
316  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
317  } else if (maps == infer({{m, k}, {n, k}, {n, m}})) {
318  std::swap(rhs, lhs);
319  rhs = rewriter.create<vector::TransposeOp>(loc, rhs, perm);
320  } else if (maps == infer({{k, m}, {k, n}, {n, m}})) {
321  std::swap(lhs, rhs);
322  lhs = rewriter.create<vector::TransposeOp>(loc, lhs, perm);
323  } else if (maps == infer({{k, m}, {n, k}, {n, m}})) {
324  std::swap(lhs, rhs);
325  } else {
326  return failure();
327  }
328  rewriter.replaceOpWithNewOp<vector::ContractionOp>(
329  op, lhs, rhs, res,
330  rewriter.getAffineMapArrayAttr(infer({{m, k}, {k, n}, {m, n}})),
331  op.getIteratorTypes());
332  return success();
333  }
334 };
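// Illustrative example: a contraction with indexing maps
//   {(k, m), (k, n), (m, n)}   (column-major LHS)
// gets its LHS transposed with permutation [1, 0], after which the op is
// rewritten with the canonical {(m, k), (k, n), (m, n)} maps handled above.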
335 
336 // Merge the transpose op into the transfer read op. Transposes are not
337 // supported on MMA types, but the MMA load can transpose the matrix on load.
338 struct CombineTransferReadOpTranspose final
339  : public OpRewritePattern<vector::TransposeOp> {
340  using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;
341 
342  LogicalResult matchAndRewrite(vector::TransposeOp op,
343  PatternRewriter &rewriter) const override {
344  auto transferReadOp =
345  op.getVector().getDefiningOp<vector::TransferReadOp>();
346  if (!transferReadOp)
347  return failure();
348 
349  // TODO: support 0-d corner case.
350  if (transferReadOp.getTransferRank() == 0)
351  return failure();
352 
353  if (transferReadOp.getMask() || transferReadOp.hasOutOfBoundsDim())
354  return failure();
355  SmallVector<int64_t, 2> perm;
356  op.getTransp(perm);
357  SmallVector<unsigned, 4> permU;
358  for (int64_t o : perm)
359  permU.push_back(unsigned(o));
360  AffineMap permutationMap =
361  AffineMap::getPermutationMap(permU, op.getContext());
362  AffineMap newMap =
363  permutationMap.compose(transferReadOp.getPermutationMap());
364  rewriter.replaceOpWithNewOp<vector::TransferReadOp>(
365  op, op.getType(), transferReadOp.getSource(),
366  transferReadOp.getIndices(), AffineMapAttr::get(newMap),
367  transferReadOp.getPadding(), transferReadOp.getMask(),
368  transferReadOp.getInBoundsAttr());
369  return success();
370  }
371 };
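// Illustrative example: a vector.transpose with permutation [1, 0] applied to
// a transfer_read with permutation map (d0, d1) -> (d0, d1) folds into a
// single transfer_read whose permutation map is (d0, d1) -> (d1, d0); the MMA
// load that consumes it can then apply the transpose through its layout.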
372 
373 } // namespace
374 
375 // MMA types have different layout based on how they are used in matmul ops.
376 // Figure the right layout to use by looking at op uses.
377 // TODO: Change the GPU dialect to abstract the layout at this level and
378 // only care about it during lowering to NVVM.
379 template <typename OpTy>
380 static const char *inferFragType(OpTy op) {
381  for (Operation *users : op->getUsers()) {
382  auto contract = dyn_cast<vector::ContractionOp>(users);
383  if (!contract)
384  continue;
385  if (contract.getLhs() == op.getResult())
386  return "AOp";
387  if (contract.getRhs() == op.getResult())
388  return "BOp";
389  }
390  return "COp";
391 }
392 
393 static void convertTransferReadOp(vector::TransferReadOp op,
394  llvm::DenseMap<Value, Value> &valueMapping) {
395  assert(op.getTransferRank() > 0 && "unexpected 0-d transfer");
396  assert(transferReadSupportsMMAMatrixType(op, /*useNvGpu=*/false));
397  Optional<int64_t> stride =
398  getMemrefConstantHorizontalStride(op.getShapedType());
399  AffineMap map = op.getPermutationMap();
400  // Handle broadcast by setting the stride to 0.
401  if (map.getResult(0).isa<AffineConstantExpr>()) {
402  assert(map.getResult(0).cast<AffineConstantExpr>().getValue() == 0);
403  stride = 0;
404  }
405  assert(stride);
406  const char *fragType = inferFragType(op);
407  gpu::MMAMatrixType type =
408  gpu::MMAMatrixType::get(op.getVectorType().getShape(),
409  op.getVectorType().getElementType(), fragType);
410  OpBuilder b(op);
411  Value load = b.create<gpu::SubgroupMmaLoadMatrixOp>(
412  op.getLoc(), type, op.getSource(), op.getIndices(),
413  b.getIndexAttr(*stride));
414  valueMapping[op.getResult()] = load;
415 }
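// Illustrative result (assuming a 16x16xf16 read with leading stride 32 that
// feeds the LHS of a contraction); the generated op looks roughly like:
//   %m = gpu.subgroup_mma_load_matrix %src[%i, %j] {leadDimension = 32 : index}
//          : memref<32x32xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">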
416 
417 static void convertTransferWriteOp(vector::TransferWriteOp op,
418  llvm::DenseMap<Value, Value> &valueMapping) {
419  assert(transferWriteSupportsMMAMatrixType(op));
420  Optional<int64_t> stride =
421  getMemrefConstantHorizontalStride(op.getShapedType());
422  assert(stride);
423  OpBuilder b(op);
424  Value matrix = valueMapping.find(op.getVector())->second;
425  b.create<gpu::SubgroupMmaStoreMatrixOp>(op.getLoc(), matrix, op.getSource(),
426  op.getIndices(),
427  b.getIndexAttr(*stride));
428  op.erase();
429 }
430 
431 /// Returns the vector type which represents a matrix fragment.
432 static VectorType
433 getMmaSyncVectorOperandType(const nvgpu::FragmentElementInfo &regInfo) {
434  SmallVector<int64_t> shape{regInfo.numRegistersPerFragment,
435  regInfo.elementsPerRegister};
436  Type elType = regInfo.registerLLVMType;
437  if (auto vecType = elType.dyn_cast<VectorType>())
438  elType = vecType.getElementType();
439  return VectorType::get(shape, elType);
440 }
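// For example (illustrative): a fragment described as 2 registers per thread,
// each holding 2 f16 elements, maps to vector<2x2xf16>.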
441 
442 /// Convert a 2D splat ConstantOp to a vector constant laid out for nvgpu.mma.sync.
443 static LogicalResult
444 convertConstantOpMmaSync(arith::ConstantOp op,
445  llvm::DenseMap<Value, Value> &valueMapping) {
446  OpBuilder b(op);
447  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
448  nvgpu::getWarpMatrixInfo(op);
449  if (failed(warpMatrixInfo))
450  return failure();
451 
452  FailureOr<nvgpu::FragmentElementInfo> regInfo =
453  nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
454  if (failed(regInfo))
455  return failure();
456 
457  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
458  auto dense = op.getValue().dyn_cast<SplatElementsAttr>();
459  if (!dense)
460  return failure();
461  Value result = b.create<arith::ConstantOp>(
462  op.getLoc(), vectorType,
463  DenseElementsAttr::get(vectorType, dense.getSplatValue<Attribute>()));
464  valueMapping[op.getResult()] = result;
465  return success();
466 }
467 
468 static LogicalResult
469 createLdMatrixCompatibleLoads(vector::TransferReadOp op, OpBuilder &builder,
470  llvm::DenseMap<Value, Value> &valueMapping) {
471  Location loc = op->getLoc();
472 
473  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
474  nvgpu::getWarpMatrixInfo(op);
475  if (failed(warpMatrixInfo))
476  return failure();
477 
478  FailureOr<nvgpu::FragmentElementInfo> regInfo =
479  nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
480  if (failed(regInfo))
481  return failure();
482 
483  FailureOr<nvgpu::LdMatrixParams> params = nvgpu::getLdMatrixParams(
484  *warpMatrixInfo,
485  /*transpose=*/!op.getPermutationMap().isMinorIdentity());
486  if (failed(params)) {
487  return op->emitError()
488  << "failed to convert vector.transfer_read to ldmatrix; this op "
489  "likely "
490  "should not be converted to a nvgpu.ldmatrix call.";
491  }
492 
493  // Adjust the load offset.
494  auto laneId = builder.create<gpu::LaneIdOp>(loc);
495  FailureOr<AffineMap> offsets =
496  nvgpu::getLaneIdToLdMatrixMatrixCoord(loc, builder, *params);
497  if (failed(offsets))
498  return failure();
499 
500  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
501 
502  SmallVector<Value, 4> indices;
503  getXferIndices<vector::TransferReadOp>(builder, op, *offsets, {laneId},
504  indices);
505  nvgpu::LdMatrixOp newOp = builder.create<nvgpu::LdMatrixOp>(
506  loc, vectorType, op.getSource(), indices,
507  !op.getPermutationMap().isMinorIdentity(), params->numTiles);
508  valueMapping[op] = newOp->getResult(0);
509  return success();
510 }
511 
512 static LogicalResult
513 createNonLdMatrixLoads(vector::TransferReadOp op, OpBuilder &builder,
514  llvm::DenseMap<Value, Value> &valueMapping) {
515  Location loc = op.getLoc();
516  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
517  nvgpu::getWarpMatrixInfo(op);
518  if (failed(warpMatrixInfo))
519  return failure();
520  FailureOr<nvgpu::FragmentElementInfo> regInfo =
521  nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
522  if (failed(regInfo)) {
523  op->emitError() << "Failed to deduce register fragment type during "
524  "conversion to distributed non-ldmatrix compatible load";
525  return failure();
526  }
527 
528  Value laneId = builder.create<gpu::LaneIdOp>(loc);
529  SmallVector<Value, 4> elements;
530 
531  // This is the individual element type.
532  Type loadedElType = regInfo->registerLLVMType;
533  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
534 
535  Value fill = builder.create<arith::ConstantOp>(
536  op.getLoc(), vectorType.getElementType(),
537  builder.getZeroAttr(vectorType.getElementType()));
538  Value result = builder.create<vector::SplatOp>(op.getLoc(), fill, vectorType);
539 
540  bool isTransposeLoad = !op.getPermutationMap().isMinorIdentity();
541 
542  // If we are not transposing, then we can use vectorized loads. Otherwise, we
543  // must load each element individually.
544  if (!isTransposeLoad) {
545  if (!loadedElType.isa<VectorType>()) {
546  loadedElType = VectorType::get({1}, loadedElType);
547  }
548 
549  for (int i = 0; i < vectorType.getShape()[0]; i++) {
550  FailureOr<AffineMap> coords = nvgpu::getLaneIdAndValueIdToOperandCoord(
551  op.getLoc(), builder, *warpMatrixInfo);
552  if (failed(coords))
553  return failure();
554  Value logicalValueId = builder.create<arith::ConstantOp>(
555  loc, builder.getIndexType(),
556  builder.getIndexAttr(i * regInfo->elementsPerRegister));
557  SmallVector<Value, 4> newIndices;
558  getXferIndices<vector::TransferReadOp>(
559  builder, op, *coords, {laneId, logicalValueId}, newIndices);
560 
561  Value el = builder.create<vector::LoadOp>(loc, loadedElType,
562  op.getSource(), newIndices);
563  result = builder.create<vector::InsertOp>(loc, el, result,
564  builder.getI64ArrayAttr(i));
565  }
566  } else {
567  if (auto vecType = loadedElType.dyn_cast<VectorType>()) {
568  loadedElType = vecType.getElementType();
569  }
570  for (int i = 0; i < vectorType.getShape()[0]; i++) {
571  for (unsigned innerIdx = 0; innerIdx < vectorType.getShape()[1];
572  innerIdx++) {
573 
574  Value logicalValueId = builder.create<arith::ConstantOp>(
575  loc, builder.getIndexType(),
576  builder.getIndexAttr(i * regInfo->elementsPerRegister + innerIdx));
577  FailureOr<AffineMap> coords = nvgpu::getLaneIdAndValueIdToOperandCoord(
578  op.getLoc(), builder, *warpMatrixInfo);
579  if (failed(coords))
580  return failure();
581 
582  SmallVector<Value, 4> newIndices;
583  getXferIndices<vector::TransferReadOp>(
584  builder, op, *coords, {laneId, logicalValueId}, newIndices);
585  Value el = builder.create<memref::LoadOp>(op.getLoc(), loadedElType,
586  op.getSource(), newIndices);
587  result = builder.create<vector::InsertOp>(
588  op.getLoc(), el, result, builder.getI64ArrayAttr({i, innerIdx}));
589  }
590  }
591  }
592 
593  valueMapping[op.getResult()] = result;
594  return success();
595 }
596 
597 /// Converts a `vector.transfer_read` operation directly to either a
598 /// `vector.load` or a `nvgpu.ldmatrix` operation. This function should only be
599 /// used when converting to `nvgpu.mma.sync` operations.
600 static LogicalResult
601 convertTransferReadToLoads(vector::TransferReadOp op,
602  llvm::DenseMap<Value, Value> &valueMapping) {
603  OpBuilder b(op);
604 
605  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
606  nvgpu::getWarpMatrixInfo(op);
607  if (failed(warpMatrixInfo))
608  return failure();
609 
610  bool isLdMatrixCompatible =
611  op.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt() == 3 &&
612  nvgpu::inferTileWidthInBits(*warpMatrixInfo) == 128;
613 
614  VectorType vecTy = op.getVectorType();
615  int64_t bitWidth = vecTy.getElementType().getIntOrFloatBitWidth();
616 
617  // When we are transposing the B operand, ldmatrix will only work if we have
618  // at least 8 rows to read and the width to read for the transpose is 128
619  // bits.
620  if (!op.getPermutationMap().isMinorIdentity() &&
621  (bitWidth != 16 || vecTy.getDimSize(1) < 8 ||
622  vecTy.getDimSize(0) * bitWidth < 128))
623  isLdMatrixCompatible = false;
624 
625  if (!isLdMatrixCompatible)
626  return createNonLdMatrixLoads(op, b, valueMapping);
627 
628  return createLdMatrixCompatibleLoads(op, b, valueMapping);
629 }
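// Putting the checks above together (illustratively): a non-transposed
// 16x16xf16 operand resident in workgroup (shared) memory with a 128-bit tile
// width takes the nvgpu.ldmatrix path, while an operand in another memory
// space, or a transposed load that is too narrow, falls back to
// createNonLdMatrixLoads.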
630 
631 static LogicalResult
632 convertTransferWriteToStores(vector::TransferWriteOp op,
633  llvm::DenseMap<Value, Value> &valueMapping) {
634  OpBuilder b(op);
635  Location loc = op->getLoc();
636  Value matrix = valueMapping.find(op.getVector())->second;
637 
638  FailureOr<nvgpu::WarpMatrixInfo> warpMatrixInfo =
639  nvgpu::getWarpMatrixInfo(op);
640  if (failed(warpMatrixInfo))
641  return failure();
642  FailureOr<nvgpu::FragmentElementInfo> regInfo =
643  nvgpu::getMmaSyncRegisterType(*warpMatrixInfo);
644  if (failed(regInfo))
645  return failure();
646 
647  VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
648  Value laneId = b.create<gpu::LaneIdOp>(loc);
649 
650  for (unsigned i = 0; i < vectorType.getShape()[0]; i++) {
651  Value logicalValueId = b.create<arith::ConstantOp>(
652  loc, b.getIndexType(),
653  b.getIndexAttr(i * regInfo->elementsPerRegister));
654  FailureOr<AffineMap> coords = nvgpu::getLaneIdAndValueIdToOperandCoord(
655  op.getLoc(), b, *warpMatrixInfo);
656  if (failed(coords))
657  return failure();
658 
659  Value el = b.create<vector::ExtractOp>(loc, matrix, ArrayRef<int64_t>{i});
660  SmallVector<Value, 4> newIndices;
661  getXferIndices<vector::TransferWriteOp>(
662  b, op, *coords, {laneId, logicalValueId}, newIndices);
663  b.create<vector::StoreOp>(loc, el, op.getSource(), newIndices);
664  }
665  op->erase();
666  return success();
667 }
668 
669 static void convertContractOp(vector::ContractionOp op,
670  llvm::DenseMap<Value, Value> &valueMapping) {
671  OpBuilder b(op);
672  Value opA = valueMapping.find(op.getLhs())->second;
673  Value opB = valueMapping.find(op.getRhs())->second;
674  Value opC = valueMapping.find(op.getAcc())->second;
675  Value matmul = b.create<gpu::SubgroupMmaComputeOp>(op.getLoc(), opC.getType(),
676  opA, opB, opC);
677  valueMapping[op.getResult()] = matmul;
678 }
679 
680 static LogicalResult
681 convertContractOpToMmaSync(vector::ContractionOp op,
682  llvm::DenseMap<Value, Value> &valueMapping) {
683  OpBuilder b(op);
684  Value opA = valueMapping.find(op.getLhs())->second;
685  Value opB = valueMapping.find(op.getRhs())->second;
686  Value opC = valueMapping.find(op.getAcc())->second;
687  int64_t m = op.getLhs().getType().cast<VectorType>().getShape()[0];
688  int64_t n = op.getRhs().getType().cast<VectorType>().getShape()[0];
689  int64_t k = op.getLhs().getType().cast<VectorType>().getShape()[1];
690  Value matmul = b.create<nvgpu::MmaSyncOp>(op.getLoc(), opA, opB, opC,
691  b.getI64ArrayAttr({m, n, k}));
692  valueMapping[op.getResult()] = matmul;
693  return success();
694 }
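// Illustrative example: with an LHS of vector<16x16xf16> and an RHS of
// vector<8x16xf16> (B stored as (n, k)), the nvgpu.mma.sync op is created with
// {m, n, k} = [16, 8, 16].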
695 
696 /// Convert a 2D splat ConstantOp to a SubgroupMmaConstantMatrix op.
697 static void convertConstantOp(arith::ConstantOp op,
698  llvm::DenseMap<Value, Value> &valueMapping) {
699  assert(constantSupportsMMAMatrixType(op));
700  OpBuilder b(op);
701  auto splat =
702  op.getValue().cast<SplatElementsAttr>().getSplatValue<TypedAttr>();
703  auto scalarConstant =
704  b.create<arith::ConstantOp>(op.getLoc(), splat.getType(), splat);
705  const char *fragType = inferFragType(op);
706  auto vecType = op.getType().cast<VectorType>();
707  gpu::MMAMatrixType type = gpu::MMAMatrixType::get(
708  vecType.getShape(), vecType.getElementType(), llvm::StringRef(fragType));
709  auto matrix = b.create<gpu::SubgroupMmaConstantMatrixOp>(op.getLoc(), type,
710  scalarConstant);
711  valueMapping[op.getResult()] = matrix;
712 }
713 
714 /// Convert a vector.broadcast from scalar to a SubgroupMmaConstantMatrix op.
715 static void convertBroadcastOp(vector::BroadcastOp op,
716  llvm::DenseMap<Value, Value> &valueMapping) {
717  assert(broadcastSupportsMMAMatrixType(op));
718  OpBuilder b(op);
719  const char *fragType = inferFragType(op);
720  auto vecType = op.getVectorType();
721  gpu::MMAMatrixType type = gpu::MMAMatrixType::get(
722  vecType.getShape(), vecType.getElementType(), llvm::StringRef(fragType));
723  auto matrix = b.create<gpu::SubgroupMmaConstantMatrixOp>(op.getLoc(), type,
724  op.getSource());
725  valueMapping[op.getResult()] = matrix;
726 }
727 
728 // Replace ForOp with a new ForOp with extra operands. The YieldOp is not
729 // updated and needs to be updated separately for the loop to be correct.
730 static scf::ForOp replaceForOpWithNewSignature(OpBuilder &b, scf::ForOp loop,
731  ValueRange newIterOperands) {
732  // Create a new loop before the existing one, with the extra operands.
733  OpBuilder::InsertionGuard guard(b);
734  b.setInsertionPoint(loop);
735  auto operands = llvm::to_vector<4>(loop.getIterOperands());
736  operands.append(newIterOperands.begin(), newIterOperands.end());
737  scf::ForOp newLoop =
738  b.create<scf::ForOp>(loop.getLoc(), loop.getLowerBound(),
739  loop.getUpperBound(), loop.getStep(), operands);
740  newLoop.getBody()->erase();
741  newLoop.getLoopBody().getBlocks().splice(
742  newLoop.getLoopBody().getBlocks().begin(),
743  loop.getLoopBody().getBlocks());
744  for (Value operand : newIterOperands)
745  newLoop.getBody()->addArgument(operand.getType(), operand.getLoc());
746 
747  for (auto it : llvm::zip(loop.getResults(), newLoop.getResults().take_front(
748  loop.getNumResults())))
749  std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
750  loop.erase();
751  return newLoop;
752 }
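// Illustrative example: scf.for ... iter_args(%acc = %init) extended with one
// extra !gpu.mma_matrix operand becomes
//   scf.for ... iter_args(%acc = %init, %accMat = %initMat)
// the original loop's results are replaced with the leading results of the new
// loop and the old loop is erased.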
753 
754 static void convertForOp(scf::ForOp op,
755  llvm::DenseMap<Value, Value> &valueMapping) {
756  SmallVector<Value> newOperands;
757  SmallVector<std::pair<size_t, size_t>> argMapping;
758  for (const auto &operand : llvm::enumerate(op.getIterOperands())) {
759  auto it = valueMapping.find(operand.value());
760  if (it == valueMapping.end())
761  continue;
762  argMapping.push_back(std::make_pair(
763  operand.index(), op.getNumIterOperands() + newOperands.size()));
764  newOperands.push_back(it->second);
765  }
766  OpBuilder b(op);
767  scf::ForOp newForOp = replaceForOpWithNewSignature(b, op, newOperands);
768  Block &loopBody = *newForOp.getBody();
769  for (auto mapping : argMapping) {
770  valueMapping[newForOp.getResult(mapping.first)] =
771  newForOp.getResult(mapping.second);
772  valueMapping[loopBody.getArgument(mapping.first +
773  newForOp.getNumInductionVars())] =
774  loopBody.getArgument(mapping.second + newForOp.getNumInductionVars());
775  }
776 }
777 
778 static void convertYieldOp(scf::YieldOp op,
779  llvm::DenseMap<Value, Value> &valueMapping) {
780  OpBuilder b(op);
781  auto loop = cast<scf::ForOp>(op->getParentOp());
782  auto yieldOperands = llvm::to_vector<4>(op.getOperands());
783  for (const auto &operand : llvm::enumerate(op.getOperands())) {
784  auto it = valueMapping.find(operand.value());
785  if (it == valueMapping.end())
786  continue;
787  // Replace the yield of old value with the for op argument to make it easier
788  // to remove the dead code.
789  yieldOperands[operand.index()] = loop.getIterOperands()[operand.index()];
790  yieldOperands.push_back(it->second);
791  }
792  b.create<scf::YieldOp>(op.getLoc(), yieldOperands);
793  op.erase();
794 }
795 
796 /// Convert an elementwise op to the equivalent elementwise op on MMA matrix.
797 static void convertElementwiseOp(Operation *op, gpu::MMAElementwiseOp opType,
798  llvm::DenseMap<Value, Value> &valueMapping) {
799  OpBuilder b(op);
800  SmallVector<Value> matrixOperands;
801  for (Value operand : op->getOperands())
802  matrixOperands.push_back(valueMapping.find(operand)->second);
803  Value newOp = b.create<gpu::SubgroupMmaElementwiseOp>(
804  op->getLoc(), matrixOperands[0].getType(), matrixOperands, opType);
805  valueMapping[op->getResult(0)] = newOp;
806 }
807 
808 void mlir::populatePrepareVectorToMMAPatterns(RewritePatternSet &patterns,
809  bool useNvGpu) {
810  if (!useNvGpu) {
811  patterns.add<PrepareContractToGPUMMA, CombineTransferReadOpTranspose>(
812  patterns.getContext());
813  return;
814  }
815  patterns
816  .add<nvgpu::PrepareContractToGPUMMASync, CombineTransferReadOpTranspose>(
817  patterns.getContext());
818 }
819 
820 void mlir::convertVectorToMMAOps(Operation *rootOp) {
821  SetVector<Operation *> ops = getOpToConvert(rootOp, /*useNvGpu=*/false);
822  llvm::DenseMap<Value, Value> valueMapping;
823  for (Operation *op : ops) {
824  if (auto transferRead = dyn_cast<vector::TransferReadOp>(op)) {
825  convertTransferReadOp(transferRead, valueMapping);
826  } else if (auto transferWrite = dyn_cast<vector::TransferWriteOp>(op)) {
827  convertTransferWriteOp(transferWrite, valueMapping);
828  } else if (auto contractOp = dyn_cast<vector::ContractionOp>(op)) {
829  convertContractOp(contractOp, valueMapping);
830  } else if (auto constantOp = dyn_cast<arith::ConstantOp>(op)) {
831  convertConstantOp(constantOp, valueMapping);
832  } else if (auto broadcastOp = dyn_cast<vector::BroadcastOp>(op)) {
833  convertBroadcastOp(broadcastOp, valueMapping);
834  } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
835  convertForOp(forOp, valueMapping);
836  } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
837  convertYieldOp(yieldOp, valueMapping);
838  } else if (auto elementwiseType = convertElementwiseOpToMMA(op)) {
839  convertElementwiseOp(op, *elementwiseType, valueMapping);
840  }
841  }
842 }
843 
844 LogicalResult mlir::convertVectorToNVVMCompatibleMMASync(Operation *rootOp) {
845  SetVector<Operation *> ops = getOpToConvert(rootOp, /*useNvGpu=*/true);
846  llvm::DenseMap<Value, Value> valueMapping;
847  for (Operation *op : ops) {
848  if (llvm::TypeSwitch<Operation *, LogicalResult>(op)
849  .Case([&](vector::TransferReadOp transferReadOp) {
850  return convertTransferReadToLoads(transferReadOp, valueMapping);
851  })
852  .Case([&](vector::TransferWriteOp transferWriteOp) {
853  return convertTransferWriteToStores(transferWriteOp,
854  valueMapping);
855  })
856  .Case([&](vector::ContractionOp contractionOp) {
857  return convertContractOpToMmaSync(contractionOp, valueMapping);
858  })
859  .Case([&](scf::ForOp forOp) {
860  convertForOp(forOp, valueMapping);
861  return success();
862  })
863  .Case([&](scf::YieldOp yieldOp) {
864  convertYieldOp(yieldOp, valueMapping);
865  return success();
866  })
867  .Case([&](arith::ConstantOp constOp) {
868  return convertConstantOpMmaSync(constOp, valueMapping);
869  })
870  .Default([&](Operation *op) {
871  op->emitError() << "unhandled vector to mma type: " << *op;
872  return failure();
873  })
874  .failed()) {
875  op->emitError() << "Failed to convert op " << *op;
876  return failure();
877  }
878  }
879  return success();
880 }
881 
882 namespace {
883 
884 struct ConvertVectorToGPUPass
885  : public ConvertVectorToGPUBase<ConvertVectorToGPUPass> {
886 
887  explicit ConvertVectorToGPUPass(bool useNvGpu_) {
888  useNvGpu.setValue(useNvGpu_);
889  }
890 
891  void runOnOperation() override {
892  RewritePatternSet patterns(&getContext());
893  populatePrepareVectorToMMAPatterns(patterns, useNvGpu.getValue());
894  if (failed(
895  applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
896  return signalPassFailure();
897 
898  if (useNvGpu.getValue()) {
899  if (failed(convertVectorToNVVMCompatibleMMASync(getOperation())))
900  return signalPassFailure();
901  }
902 
903  (void)convertVectorToMMAOps(getOperation());
904  }
905 };
906 
907 } // namespace
908 
909 std::unique_ptr<Pass> mlir::createConvertVectorToGPUPass(bool useNvGpu) {
910  return std::make_unique<ConvertVectorToGPUPass>(useNvGpu);
911 }