MLIR  19.0.0git
TensorTilingInterfaceImpl.cpp
Go to the documentation of this file.
1 //===- TensorTilingInterface.cpp - Tiling Interface models *- C++ ------*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
21 
22 using namespace mlir;
23 using namespace mlir::tensor;
24 
25 namespace {
26 
27 struct PadOpTiling : public TilingInterface::ExternalModel<PadOpTiling, PadOp> {
28 
29  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
30  auto padOp = cast<PadOp>(op);
32  padOp.getResultType().getRank(), utils::IteratorType::parallel);
33  return iteratorTypes;
34  }
35 
36  SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
37  ReifiedRankedShapedTypeDims reifiedShapes;
38  (void)reifyResultShapes(b, op, reifiedShapes);
39  OpFoldResult zero = b.getIndexAttr(0);
40  OpFoldResult one = b.getIndexAttr(1);
41  // Initialize all the ranges to {zero, one, one}. All the `ub`s are
42  // overwritten.
43  SmallVector<Range> loopRanges(reifiedShapes[0].size(), {zero, one, one});
44  for (const auto &ub : enumerate(reifiedShapes[0]))
45  loopRanges[ub.index()].size = ub.value();
46  return loopRanges;
47  }
48 
49  FailureOr<TilingResult>
50  getTiledImplementation(Operation *op, OpBuilder &b,
51  ArrayRef<OpFoldResult> offsets,
52  ArrayRef<OpFoldResult> sizes) const {
53  FailureOr<TilingResult> result =
54  tensor::bubbleUpPadSlice(b, cast<PadOp>(op), offsets, sizes);
55  if (failed(result))
56  return failure();
57  return result.value();
58  }
59 
60  LogicalResult
61  getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
62  ArrayRef<OpFoldResult> offsets,
64  SmallVector<OpFoldResult> &resultOffsets,
65  SmallVector<OpFoldResult> &resultSizes) const {
66  resultOffsets.assign(offsets.begin(), offsets.end());
67  resultSizes.assign(sizes.begin(), sizes.end());
68  return success();
69  }
70 };
71 
72 template <typename OpTy>
73 static SmallVector<Range> getPackUnPackIterationDomain(OpTy op,
74  OpBuilder &builder) {
75  static_assert(llvm::is_one_of<OpTy, PackOp, UnPackOp>::value,
76  "applies to only pack or unpack operations");
77  OpBuilder::InsertionGuard g(builder);
78  int64_t rank = (std::is_same<OpTy, PackOp>::value) ? op.getSourceRank()
79  : op.getDestRank();
80  OpFoldResult zero = builder.getIndexAttr(0);
81  OpFoldResult one = builder.getIndexAttr(1);
82  ReifiedRankedShapedTypeDims resultShape;
83  (void)reifyResultShapes(builder, op, resultShape);
84  SmallVector<Range> loopBounds(rank);
85  for (auto dim : llvm::seq<int64_t>(0, rank)) {
86  loopBounds[dim].offset = zero;
87  loopBounds[dim].stride = one;
88  loopBounds[dim].size = resultShape[0][dim];
89  }
90  return loopBounds;
91 }
92 
93 static void applyPermToRange(SmallVector<OpFoldResult> &offsets,
95  ArrayRef<int64_t> permutation) {
96  if (permutation.empty())
97  return;
98  applyPermutationToVector<OpFoldResult>(offsets, permutation);
99  applyPermutationToVector<OpFoldResult>(sizes, permutation);
100 }
101 
102 struct PackOpTiling
103  : public TilingInterface::ExternalModel<PackOpTiling, PackOp> {
104 
105  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
106  // Note that here we only consider untiled dimensions and outer tiled data
107  // dimensions, the inner tiled data dimensions are materialized when
108  // building the body of the operation.
109  auto packOp = cast<PackOp>(op);
110  SmallVector<utils::IteratorType> iteratorTypes(
111  packOp.getSourceRank(), utils::IteratorType::parallel);
112  return iteratorTypes;
113  }
114 
115  SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
116  return getPackUnPackIterationDomain<PackOp>(cast<PackOp>(op), b);
117  }
118 
119  FailureOr<TilingResult>
120  getTiledImplementation(Operation *op, OpBuilder &b,
121  ArrayRef<OpFoldResult> offsets,
122  ArrayRef<OpFoldResult> sizes) const {
123  auto packOp = cast<PackOp>(op);
124  Location loc = packOp.getLoc();
125 
126  // The tiling is applied on interchanged dimensions. We have to undo the
127  // interchange to map sizes and offsets to the original input.
128  int64_t inputRank = packOp.getSourceRank();
129  SmallVector<OpFoldResult> origOffsets(offsets.begin(), offsets.end());
130  SmallVector<OpFoldResult> origSizes(sizes.begin(), sizes.end());
131  applyPermToRange(origOffsets, origSizes,
132  invertPermutationVector(packOp.getOuterDimsPerm()));
133 
134  DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
135  packOp.getDimAndTileMapping();
136  SmallVector<OpFoldResult> srcDimValues =
137  tensor::getMixedSizes(b, loc, packOp.getSource());
138  SmallVector<OpFoldResult> inputIndices, inputSizes;
139  for (auto dim : llvm::seq<int64_t>(0, inputRank)) {
140  using AV = affine::AffineValueExpr;
141  affine::AffineBuilder ab(b, loc);
142  AffineExpr dim0, dim1, sym;
143  bindDims(b.getContext(), dim0, dim1);
144  bindSymbols(b.getContext(), sym);
145  if (dimAndTileMapping.count(dim)) {
146  // If the data dimension is tiled, the i-th index is the product of
147  // offset_i and tile_i, and the i-th size is the product of sizes_i and
148  // tile_i.
149  auto avOffset = AV(dim0).bind(origOffsets[dim]);
150  auto avSize = AV(dim0).bind(origSizes[dim]);
151  auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]);
152  inputIndices.push_back(ab.mul(avOffset, avTileSize));
153  inputSizes.push_back(ab.mul(avSize, avTileSize));
154  } else {
155  inputIndices.push_back(origOffsets[dim]);
156  inputSizes.push_back(origSizes[dim]);
157  }
158 
159  // Limit the size of the input operand for incomplete tiles.
160  if (packOp.getPaddingValue()) {
161  OpFoldResult dimSize = srcDimValues[dim];
162  auto avDimSize = AV(dim0).bind(dimSize);
163  auto avInputIdx = AV(dim1).bind(inputIndices.back());
164  inputSizes.back() =
165  ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)});
166  }
167  }
168 
169  auto oneAttr = b.getI64IntegerAttr(1);
170  SmallVector<OpFoldResult> strides(inputRank, oneAttr);
171 
172  SmallVector<Value> tiledOperands;
173  tiledOperands.push_back(b.create<ExtractSliceOp>(
174  loc, packOp.getSource(), inputIndices, inputSizes, strides));
175 
176  SmallVector<OpFoldResult> outputOffsets, outputSizes;
177  if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets,
178  outputSizes)))
179  return {};
180 
181  strides.append(packOp.getDestRank() - inputRank, oneAttr);
182  auto extractSlice = b.create<ExtractSliceOp>(
183  loc, packOp.getDest(), outputOffsets, outputSizes, strides);
184  tiledOperands.push_back(extractSlice);
185 
186  if (auto val = packOp.getPaddingValue())
187  tiledOperands.push_back(val);
188  for (auto tile : packOp.getInnerTiles())
189  tiledOperands.push_back(tile);
190 
191  Operation *tiledPackOp = b.create<PackOp>(
192  loc, TypeRange{extractSlice.getType()}, tiledOperands, op->getAttrs());
193 
194  return TilingResult{{tiledPackOp},
195  SmallVector<Value>(tiledPackOp->getResults())};
196  }
197 
198  LogicalResult
199  getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
200  ArrayRef<OpFoldResult> offsets,
202  SmallVector<OpFoldResult> &resultOffsets,
203  SmallVector<OpFoldResult> &resultSizes) const {
204  // The iteration domain is over outer dimensions of packed layout. In this
205  // context, the outer dimensions of `resultOffsets` are `offsets`. The
206  // inner dimensions of `resultOffsets` are zeros because tiling is not
207  // applied to them.
208  auto packOp = cast<PackOp>(op);
209  int64_t inputRank = packOp.getSourceRank();
210  int64_t outputRank = packOp.getDestRank();
211  auto zeroAttr = b.getI64IntegerAttr(0);
212  resultOffsets.assign(offsets.begin(), offsets.end());
213  resultOffsets.append(outputRank - inputRank, zeroAttr);
214 
215  ReifiedRankedShapedTypeDims outputShape;
216  (void)reifyResultShapes(b, packOp, outputShape);
217  resultSizes.assign(sizes.begin(), sizes.end());
218  for (auto dataTileDim : llvm::seq<unsigned>(inputRank, outputRank))
219  resultSizes.push_back(outputShape[0][dataTileDim]);
220 
221  return success();
222  }
223 
224  FailureOr<TilingResult>
225  generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber,
226  ArrayRef<OpFoldResult> offsets,
227  ArrayRef<OpFoldResult> sizes) const {
228  auto packOp = cast<PackOp>(op);
229  int64_t numTiles = packOp.getInnerDimsPos().size();
230 
231  // tensor.pack op is fusible (as a producer) only if full inner tiles are
232  // iterated or inner dims are not tiled. Otherwise, it will generate a
233  // sequence of non-trivial ops (for partial tiles).
234  for (auto offset : offsets.take_back(numTiles))
235  if (!isConstantIntValue(offset, 0))
236  return failure();
237 
238  for (auto iter :
239  llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles)))
240  if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter)))
241  return failure();
242 
243  FailureOr<TilingResult> tilingResult = getTiledImplementation(
244  op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles));
245  if (failed(tilingResult))
246  return failure();
247  return tilingResult.value();
248  }
249 };
250 
251 struct UnpackTileDimInfo {
252  bool isAlignedToInnerTileSize;
253  OpFoldResult sourceOffset;
254  OpFoldResult sourceSize;
255  OpFoldResult resultOffset;
256  OpFoldResult destExpandedSize;
257 };
258 
259 /// Returns the needed information for tiling unpack op on `tileDim` with given
260 /// `tileOffset` and `tileSize`. For more details, see the comment of the
261 /// `getTiledImplementation`.
262 static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp,
263  int64_t tileDim,
264  OpFoldResult tileOffset,
265  OpFoldResult tileSize) {
266  UnpackTileDimInfo info;
267  Attribute zeroAttr = b.getIndexAttr(0);
268  Attribute oneAttr = b.getIndexAttr(1);
269  DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
270  unpackOp.getDimAndTileMapping();
271  // The dimension is not one of packed data dimension.
272  if (!dimAndTileMapping.count(tileDim)) {
273  info.isAlignedToInnerTileSize = true;
274  info.sourceOffset = tileOffset;
275  info.sourceSize = tileSize;
276  info.resultOffset = zeroAttr;
277  info.destExpandedSize = tileSize;
278  return info;
279  }
280 
281  Location loc = unpackOp.getLoc();
282  using AV = affine::AffineValueExpr;
283  affine::AffineBuilder ab(b, loc);
284  AffineExpr dim0, dim1, sym0;
285  bindDims(b.getContext(), dim0, dim1);
286  bindSymbols(b.getContext(), sym0);
287 
288  OpFoldResult innerTileSize = dimAndTileMapping[tileDim];
289 
290  info.isAlignedToInnerTileSize = false;
291  FailureOr<int64_t> cstSize = ValueBoundsConstraintSet::computeConstantBound(
292  presburger::BoundType::UB, tileSize,
293  /*stopCondition=*/nullptr, /*closedUB=*/true);
294  std::optional<int64_t> cstInnerSize = getConstantIntValue(innerTileSize);
295  if (!failed(cstSize) && cstInnerSize) {
296  if (*cstSize % *cstInnerSize == 0)
297  info.isAlignedToInnerTileSize = true;
298 
299  // If the tiling size equals to the inner tiling size, the outer dims are
300  // always 1.
301  if (*cstInnerSize == *cstSize) {
302  auto lhs = AV(dim0).bind(tileOffset);
303  auto rhs = AV(dim1).bind(innerTileSize);
304  info.sourceOffset = ab.floor(lhs, rhs);
305  info.sourceSize = oneAttr;
306  info.resultOffset = zeroAttr;
307  info.destExpandedSize = tileSize;
308  return info;
309  }
310  }
311 
312  if (info.isAlignedToInnerTileSize) {
313  info.sourceOffset =
314  ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize));
315  info.resultOffset = zeroAttr;
316  info.destExpandedSize = tileSize;
317 
318  // The ceilDiv is needed here because there could be incomplete tile even
319  // it is perfect tiling cases. E.g.,
320  // %0 = unpack tensor<33x2xf32> into tensor<64xf32>
321  // If the tiling size is 32, there will be 3 tiles. Two of them have
322  // size=32; one of them have size=2. The size is represented using
323  // affine_min op; we need ceilDiv.
324  info.sourceSize =
325  ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize));
326  return info;
327  }
328 
330  b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset),
331  getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
332  OpFoldResult tileExclusiveBound =
333  ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize));
335  b, loc,
337  b, loc,
338  ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))),
339  getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
340 
341  OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient),
342  AV(dim1).bind(firstCoord.quotient));
343  info.sourceSize =
344  ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr));
345  info.sourceOffset = firstCoord.quotient;
346  info.resultOffset = firstCoord.remainder;
347  // Do not create an Affine ops for expanded size because the affine op is too
348  // complicated which would trigger an issue in affine ops simplification.
349  info.destExpandedSize = b.createOrFold<arith::MulIOp>(
350  loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize),
351  getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
352  return info;
353 }
354 
355 struct UnPackOpTiling
356  : public TilingInterface::ExternalModel<UnPackOpTiling, UnPackOp> {
357 
358  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
359  auto unpackOp = cast<UnPackOp>(op);
360  SmallVector<utils::IteratorType> iteratorTypes(
361  unpackOp.getDestRank(), utils::IteratorType::parallel);
362  return iteratorTypes;
363  }
364 
365  SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
366  return getPackUnPackIterationDomain<UnPackOp>(cast<UnPackOp>(op), b);
367  }
368 
369  /// There are two cases in tiling unpack ops. If the tiling size is aligned to
370  /// the inner tile size, the corresponding tiles of source are all complete.
371  /// Otherwise, there are in-complete tiles. We will need to expand the slice
372  /// of source for getting complete tiles. The tiled unpack op unpacks more
373  /// data from source, so We'll need an extract_slice op to shift and truncate
374  /// the output.
375  /// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The
376  /// coordinates of second tile (i.e., result[15..31]) are
377  /// [(1, 7), (2, 0,), (2, 1) ... (3, 6), (3, 7)]. The first row and the last
378  /// row are incomplete tiles. To represent the unpack op, we have to complete
379  /// the rows. I.e., the input coordinates would start with (1, 0); end with
380  /// (3, 7). In this context, the tiled unpack produces a (3 * n) elements
381  /// because there are 3 rows in total. Follow by a tensor.extract_slice op, we
382  /// can get the actual result.
383  FailureOr<TilingResult>
384  getTiledImplementation(Operation *op, OpBuilder &b,
385  ArrayRef<OpFoldResult> offsets,
386  ArrayRef<OpFoldResult> sizes) const {
387  auto unpackOp = cast<UnPackOp>(op);
388  int64_t srcRank = unpackOp.getSourceRank();
389  int64_t destRank = unpackOp.getDestRank();
390  int64_t numInnerTiles = srcRank - destRank;
391  Location loc = unpackOp.getLoc();
392 
393  // The perfect tiling case indicates that the tiling sizes are multiple of
394  // inner_tile_size. In this context, no extra data is needed when
395  // representing the tiled unpack op.
396  bool isPerfectTilingCase = true;
397  Attribute oneAttr = b.getIndexAttr(1);
398  SmallVector<OpFoldResult> sliceSrcStrides(destRank, oneAttr);
399  SmallVector<OpFoldResult> sliceSrcIndices, sliceSrcSizes;
400  SmallVector<OpFoldResult> destExpandedSizes, resultOffsetsFromDest;
401  for (auto dim : llvm::seq<int64_t>(0, destRank)) {
402  UnpackTileDimInfo info =
403  getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]);
404  if (!info.isAlignedToInnerTileSize)
405  isPerfectTilingCase = false;
406  sliceSrcIndices.push_back(info.sourceOffset);
407  sliceSrcSizes.push_back(info.sourceSize);
408  destExpandedSizes.push_back(info.destExpandedSize);
409  resultOffsetsFromDest.push_back(info.resultOffset);
410  }
411 
412  // The tiling is applied on destination dimensions. We have to apply the
413  // interchange on source dimensions if outer_dims_perm is set.
414  applyPermToRange(sliceSrcIndices, sliceSrcSizes,
415  unpackOp.getOuterDimsPerm());
416  Attribute zeroAttr = b.getIndexAttr(0);
417  sliceSrcIndices.append(numInnerTiles, zeroAttr);
418  sliceSrcSizes.append(unpackOp.getMixedTiles());
419  sliceSrcStrides.append(numInnerTiles, oneAttr);
420  Value sliceSource =
421  b.create<ExtractSliceOp>(loc, unpackOp.getSource(), sliceSrcIndices,
422  sliceSrcSizes, sliceSrcStrides);
423 
424  SmallVector<OpFoldResult> destStrides(destRank, oneAttr);
425  Value sliceDest;
426  if (isPerfectTilingCase) {
427  sliceDest = b.create<ExtractSliceOp>(loc, unpackOp.getDest(), offsets,
428  sizes, destStrides);
429  } else {
430  sliceDest = b.create<EmptyOp>(loc, destExpandedSizes,
431  unpackOp.getDestType().getElementType());
432  }
433 
434  SmallVector<Value> tiledOperands = {sliceSource, sliceDest};
435  for (auto tile : unpackOp.getInnerTiles())
436  tiledOperands.push_back(tile);
437 
438  Operation *tiledUnpackOp = b.create<UnPackOp>(
439  loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs());
440 
441  if (isPerfectTilingCase)
442  return TilingResult{{tiledUnpackOp},
443  SmallVector<Value>(tiledUnpackOp->getResults())};
444 
445  auto extractSlice =
446  b.create<ExtractSliceOp>(loc, tiledUnpackOp->getResult(0),
447  resultOffsetsFromDest, sizes, destStrides);
448  return TilingResult{{tiledUnpackOp}, {extractSlice.getResult()}};
449  }
450 
451  LogicalResult
452  getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
453  ArrayRef<OpFoldResult> offsets,
455  SmallVector<OpFoldResult> &resultOffsets,
456  SmallVector<OpFoldResult> &resultSizes) const {
457  resultOffsets = llvm::to_vector(offsets);
458  resultSizes = llvm::to_vector(sizes);
459  return success();
460  }
461 
462  FailureOr<TilingResult>
463  generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber,
464  ArrayRef<OpFoldResult> offsets,
465  ArrayRef<OpFoldResult> sizes) const {
466  FailureOr<TilingResult> tilingResult =
467  getTiledImplementation(op, b, offsets, sizes);
468  if (failed(tilingResult))
469  return failure();
470  return tilingResult.value();
471  }
472 
473  /// Method to return the position of iteration domain tile computed by the
474  /// tiled operation.
475  LogicalResult getIterationDomainTileFromOperandTile(
476  Operation *op, OpBuilder &b, unsigned operandNumber,
478  SmallVectorImpl<OpFoldResult> &resultOffsets,
479  SmallVectorImpl<OpFoldResult> &resultSizes) const {
480  auto unPackOp = cast<UnPackOp>(op);
481  Location loc = unPackOp.getLoc();
482 
483  int64_t numTiles = unPackOp.getInnerDimsPos().size();
484  auto destOffsets = offsets.drop_back(numTiles);
485  auto destSizes = sizes.drop_back(numTiles);
486  // The tiling is applied on interchanged dimensions. We have to undo the
487  // interchange to map sizes and offsets to the original input.
488  int64_t outputRank = unPackOp.getDestRank();
489  SmallVector<OpFoldResult> origOffsets(destOffsets.begin(),
490  destOffsets.end());
491  SmallVector<OpFoldResult> origSizes(destSizes.begin(), destSizes.end());
492  applyPermToRange(origOffsets, origSizes,
493  invertPermutationVector(unPackOp.getOuterDimsPerm()));
494 
495  DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
496  unPackOp.getDimAndTileMapping();
497 
498  for (auto dim : llvm::seq<int64_t>(0, outputRank)) {
499  using AV = affine::AffineValueExpr;
500  affine::AffineBuilder ab(b, loc);
501  AffineExpr dim0, dim1, sym;
502  bindDims(b.getContext(), dim0, dim1);
503  bindSymbols(b.getContext(), sym);
504  if (dimAndTileMapping.count(dim)) {
505  // If the data dimension is tiled, the i-th index is the product of
506  // offset_i and tile_i, and the i-th size is the product of sizes_i and
507  // tile_i.
508  auto avOffset = AV(dim0).bind(origOffsets[dim]);
509  auto avSize = AV(dim0).bind(origSizes[dim]);
510  auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]);
511  resultOffsets.push_back(ab.mul(avOffset, avTileSize));
512  resultSizes.push_back(ab.mul(avSize, avTileSize));
513  } else {
514  resultOffsets.push_back(origOffsets[dim]);
515  resultSizes.push_back(origSizes[dim]);
516  }
517  }
518  return success();
519  }
520 
521  /// Method to return the tiled implementation of tensor.unpack as a consumer.
522  FailureOr<TilingResult> getTiledImplementationFromOperandTile(
523  Operation *op, OpBuilder &b, unsigned operandNumber,
524  ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes) const {
525  auto unPackOp = cast<UnPackOp>(op);
526  // tensor.unpack op is fusible (as a consumer) only if inner dims are not
527  // tiled.
528  int64_t numTiles = unPackOp.getInnerDimsPos().size();
529  for (auto iter :
530  llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) {
531  if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter)))
532  return failure();
533  }
534 
535  Location loc = unPackOp.getLoc();
536 
537  // Fetch offset/size for creating the slice of the dest operand of
538  // unpack op.
539  SmallVector<OpFoldResult> outputOffsets, outputSizes;
540  if (failed(getIterationDomainTileFromOperandTile(
541  op, b, /*operandNumber=*/0, offsets, sizes, outputOffsets,
542  outputSizes)))
543  return failure();
544 
545  auto oneAttr = b.getI64IntegerAttr(1);
546  int64_t outputRank = unPackOp.getDestRank();
547  SmallVector<OpFoldResult> strides(outputRank, oneAttr);
548 
549  SmallVector<Value> tiledOperands;
550  // Create slice of the dest operand.
551  auto extractDestSlice = b.create<ExtractSliceOp>(
552  loc, unPackOp.getDest(), outputOffsets, outputSizes, strides);
553  tiledOperands.push_back(extractDestSlice);
554 
555  SmallVector<OpFoldResult> inputOffsets, inputSizes;
556  strides.append(unPackOp.getSourceRank() - outputRank, oneAttr);
557  // Create slice of the source operand.
558  auto extractSourceSlice = b.create<ExtractSliceOp>(
559  loc, unPackOp.getSource(), offsets, sizes, strides);
560  tiledOperands.insert(tiledOperands.begin(), extractSourceSlice);
561  for (auto tile : unPackOp.getInnerTiles())
562  tiledOperands.push_back(tile);
563 
564  // Create tiled unpack op.
565  Operation *tiledUnPackOp =
566  b.create<UnPackOp>(loc, TypeRange{extractDestSlice.getType()},
567  tiledOperands, op->getAttrs());
568 
569  return TilingResult{{tiledUnPackOp},
570  SmallVector<Value>(tiledUnPackOp->getResults())};
571  }
572 };
573 
574 } // namespace
575 
576 FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
577  tensor::PadOp padOp,
578  ArrayRef<OpFoldResult> offsets,
580  bool generateZeroSliceGuard) {
581  // Only constant padding value supported.
582  Value padValue = padOp.getConstantPaddingValue();
583  if (!padValue)
584  return failure();
585 
586  // Helper variables and functions for various arithmetic operations. These
587  // are used extensively for computing new offset/length and padding values.
588  Location loc = padOp->getLoc();
589  AffineExpr dim0, dim1;
590  bindDims(b.getContext(), dim0, dim1);
591  // Add two integers.
592  auto addMap = AffineMap::get(2, 0, {dim0 + dim1});
593  auto add = [&](OpFoldResult v1, OpFoldResult v2) {
594  return affine::makeComposedFoldedAffineApply(b, loc, addMap, {v1, v2});
595  };
596  // Subtract two integers.
597  auto subMap = AffineMap::get(2, 0, {dim0 - dim1});
598  auto sub = [&](OpFoldResult v1, OpFoldResult v2) {
599  return affine::makeComposedFoldedAffineApply(b, loc, subMap, {v1, v2});
600  };
601  // Take the minimum of two integers.
602  auto idMap = AffineMap::getMultiDimIdentityMap(2, b.getContext());
603  auto min = [&](OpFoldResult v1, OpFoldResult v2) {
604  return affine::makeComposedFoldedAffineMin(b, loc, idMap, {v1, v2});
605  };
606  // Take the maximum of two integers.
607  auto max = [&](OpFoldResult v1, OpFoldResult v2) {
608  return affine::makeComposedFoldedAffineMax(b, loc, idMap, {v1, v2});
609  };
610  // Zero index-typed integer.
611  OpFoldResult zero = b.getIndexAttr(0);
612 
613  // Compute new offsets, lengths, low padding, high padding.
614  SmallVector<OpFoldResult> newOffsets, newLengths, newStrides;
615  SmallVector<OpFoldResult> newLows, newHighs;
616  // Set to true if the original data source is not read at all.
617  bool hasZeroLen = false;
618  // Same as hasZeroLen, but for dynamic dimension sizes. This condition
619  // is true if the original data source turns out to be unused at runtime.
620  Value dynHasZeroLenCond;
621 
622  int64_t rank = padOp.getSourceType().getRank();
623  for (unsigned dim = 0; dim < rank; ++dim) {
624  auto low = padOp.getMixedLowPad()[dim];
625  bool hasLowPad = !isConstantIntValue(low, 0);
626  auto high = padOp.getMixedHighPad()[dim];
627  bool hasHighPad = !isConstantIntValue(high, 0);
628  auto offset = offsets[dim];
629  auto length = sizes[dim];
630  auto srcSize = tensor::getMixedSize(b, loc, padOp.getSource(), dim);
631 
632  // The new amount of low padding is `low - offset`. Except for the case
633  // where none of the low padding is read. In that case, the new amount of
634  // low padding is zero.
635  //
636  // Optimization: If low = 0, then newLow = 0.
637  OpFoldResult newLow = hasLowPad ? max(zero, sub(low, offset)) : zero;
638  newLows.push_back(newLow);
639 
640  // Start reading the data from position `offset - low`. Since the original
641  // read may have started in the low padding zone, this value could be
642  // negative. Therefore, start reading from:
643  //
644  // max(offset - low, 0)
645  //
646  // The original read could also have started in the high padding zone.
647  // In that case, set the offset to the end of source tensor. The new
648  // ExtractSliceOp length will be zero in that case. (Effectively reading
649  // no data from the source.)
650  //
651  // Optimization: If low = 0, then the formula can be simplified.
652  OpFoldResult newOffset = hasLowPad
653  ? min(max(sub(offset, low), zero), srcSize)
654  : min(offset, srcSize);
655  newOffsets.push_back(newOffset);
656 
657  // The original ExtractSliceOp was reading until position `offset +
658  // length`. Therefore, the corresponding position within the source tensor
659  // is:
660  //
661  // offset + length - low
662  //
663  // In case the original ExtractSliceOp stopped reading within the low
664  // padding zone, this value can be negative. In that case, the end
665  // position of the read should be zero. (Similar to newOffset.)
666  //
667  // The original read could also have stopped in the high padding zone.
668  // In that case, set the end positition of the read should be the end of
669  // the source tensor. (Similar to newOffset.)
670  //
671  // endLoc = min(max(offset - low + length, 0), srcSize)
672  //
673  // The new ExtractSliceOp length is `endLoc - newOffset`.
674  //
675  // Optimization: If low = 0, then the formula can be simplified.
676  OpFoldResult endLoc =
677  hasLowPad ? min(max(add(sub(offset, low), length), zero), srcSize)
678  : min(add(offset, length), srcSize);
679  OpFoldResult newLength = sub(endLoc, newOffset);
680  newLengths.push_back(newLength);
681 
682  // Check if newLength is zero. In that case, no SubTensorOp should be
683  // executed.
684  if (isConstantIntValue(newLength, 0)) {
685  hasZeroLen = true;
686  } else if (!hasZeroLen) {
687  Value check = b.create<arith::CmpIOp>(
688  loc, arith::CmpIPredicate::eq,
689  getValueOrCreateConstantIndexOp(b, loc, newLength),
690  getValueOrCreateConstantIndexOp(b, loc, zero));
691  dynHasZeroLenCond =
692  dynHasZeroLenCond
693  ? b.create<arith::OrIOp>(loc, check, dynHasZeroLenCond)
694  : check;
695  }
696 
697  // The amount of high padding is simply the number of elements remaining,
698  // so that the result has the same length as the original ExtractSliceOp.
699  // As an optimization, if the original high padding is zero, then the new
700  // high padding must also be zero.
701  OpFoldResult newHigh =
702  hasHighPad ? sub(sub(length, newLength), newLow) : zero;
703  newHighs.push_back(newHigh);
704 
705  // Only unit stride supported.
706  newStrides.push_back(b.getIndexAttr(1));
707  }
708 
709  // The shape of the result can be obtained from the sizes passed in.
710  SmallVector<Value> dynDims;
711  SmallVector<int64_t> shape;
712  dispatchIndexOpFoldResults(sizes, dynDims, shape);
713  RankedTensorType resultType =
714  RankedTensorType::get(shape, padOp.getResultType().getElementType());
715 
716  // Insert cast to ensure that types match. (May be folded away.)
717  auto castResult = [&](Value val) -> Value {
718  if (resultType == val.getType())
719  return val;
720  return b.create<tensor::CastOp>(loc, resultType, val);
721  };
722 
723  // In cases where the original data source is unused: Emit a GenerateOp and
724  // do not generate a SliceOp. (The result shape of the SliceOp would
725  // have a dimension of size 0, the semantics of which is unclear.)
726  auto createGenerateOp = [&]() {
727  // Create GenerateOp.
728  auto generateOp = b.create<tensor::GenerateOp>(
729  loc, resultType, dynDims,
730  [&](OpBuilder &builder, Location gLoc, ValueRange indices) {
731  builder.create<tensor::YieldOp>(gLoc, padValue);
732  });
733  return generateOp;
734  };
735 
736  // Emit a SliceOp and a PadOp. Should not be used in cases where
737  // the result shape of the new SliceOp has a zero dimension.
738  auto createPadOfExtractSlice = [&]() {
739  // Create pad(extract_slice(x)).
740  Value newSliceOp = b.create<tensor::ExtractSliceOp>(
741  loc, padOp.getSource(), newOffsets, newLengths, newStrides);
742  auto newPadOp = b.create<PadOp>(
743  loc, Type(), newSliceOp, newLows, newHighs,
744  /*nofold=*/padOp.getNofold(),
745  getPrunedAttributeList(padOp, PadOp::getAttributeNames()));
746 
747  // Copy region to new PadOp.
748  IRMapping bvm;
749  padOp.getRegion().cloneInto(&newPadOp.getRegion(), bvm);
750 
751  // Cast result and return.
752  return newPadOp;
753  };
754 
755  // Rewrite extract_slice(pad(x)) into a GenerateOp it is statically known that
756  // the original data source x is not used.
757  if (hasZeroLen) {
758  Operation *generateOp = createGenerateOp();
759  return TilingResult{{generateOp}, {castResult(generateOp->getResult(0))}};
760  }
761 
762  // If there are dynamic dimensions: Generate an scf.if check to avoid
763  // creating SliceOps with result dimensions of size 0 at runtime.
764  if (generateZeroSliceGuard && dynHasZeroLenCond) {
765  Operation *thenOp;
766  Operation *elseOp;
767  auto result = b.create<scf::IfOp>(
768  loc, dynHasZeroLenCond,
769  /*thenBuilder=*/
770  [&](OpBuilder &b, Location loc) {
771  thenOp = createGenerateOp();
772  b.create<scf::YieldOp>(loc, castResult(thenOp->getResult(0)));
773  },
774  /*elseBuilder=*/
775  [&](OpBuilder &b, Location loc) {
776  elseOp = createPadOfExtractSlice();
777  b.create<scf::YieldOp>(loc, castResult(elseOp->getResult(0)));
778  });
779  return TilingResult{{elseOp}, SmallVector<Value>(result->getResults())};
780  }
781 
782  Operation *newPadOp = createPadOfExtractSlice();
783  return TilingResult{{newPadOp}, {castResult(newPadOp->getResult(0))}};
784 }
785 
787  DialectRegistry &registry) {
788  registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) {
789  tensor::PadOp::attachInterface<PadOpTiling>(*ctx);
790  tensor::PackOp::attachInterface<PackOpTiling>(*ctx);
791  tensor::UnPackOp::attachInterface<UnPackOpTiling>(*ctx);
792  });
793 }
794 
796  DialectRegistry &registry) {
797  registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) {
798  tensor::PackOp::attachInterface<PackOpTiling>(*ctx);
799  tensor::UnPackOp::attachInterface<UnPackOpTiling>(*ctx);
800  });
801 }
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:68
static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context)
Returns an AffineMap with 'numDims' identity result dim exprs.
Definition: AffineMap.cpp:321
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
Attributes are known-constant values of operations.
Definition: Attributes.h:25
IntegerAttr getIndexAttr(int64_t value)
Definition: Builders.cpp:124
IntegerAttr getI64IntegerAttr(int64_t value)
Definition: Builders.cpp:128
MLIRContext * getContext() const
Definition: Builders.h:55
The DialectRegistry maps a dialect namespace to a constructor for the matching dialect.
void addExtension(std::unique_ptr< DialectExtensionBase > extension)
Add the given extension to the registry.
This is a utility class for mapping one set of IR entities to another.
Definition: IRMapping.h:26
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60
RAII guard to reset the insertion point of the builder when destroyed.
Definition: Builders.h:350
This class helps build Operations.
Definition: Builders.h:209
void createOrFold(SmallVectorImpl< Value > &results, Location location, Args &&...args)
Create an operation of specific op type at the current insertion point, and immediately try to fold i...
Definition: Builders.h:522
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:464
This class represents a single result from folding an operation.
Definition: OpDefinition.h:268
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:402
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
Definition: Operation.h:507
result_range getResults()
Definition: Operation.h:410
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:36
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
static FailureOr< int64_t > computeConstantBound(presburger::BoundType type, const Variable &var, StopConditionFn stopCondition=nullptr, bool closedUB=false)
Compute a constant bound for the given variable.
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:129
OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMaxOp that computes a maximum across the results of applying map to operands,...
Definition: AffineOps.cpp:1305
OpFoldResult makeComposedFoldedAffineMin(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineMinOp that computes a minimum across the results of applying map to operands,...
Definition: AffineOps.cpp:1298
OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any...
Definition: AffineOps.cpp:1192
DivModValue getDivMod(OpBuilder &b, Location loc, Value lhs, Value rhs)
Create IR to calculate (div lhs, rhs) and (mod lhs, rhs).
Definition: Utils.cpp:1827
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Definition: Matchers.h:285
FailureOr< TilingResult > bubbleUpPadSlice(OpBuilder &b, tensor::PadOp padOp, ArrayRef< OpFoldResult > offsets, ArrayRef< OpFoldResult > sizes, bool generateZeroSliceGuard=true)
Bubbles up a slice of this pad by taking the slice first and then performing the padding.
void registerTilingInterfaceExternalModels(mlir::DialectRegistry &registry)
Registers external models for Tiling interface for tensor ops.
OpFoldResult getMixedSize(OpBuilder &builder, Location loc, Value value, int64_t dim)
Return the dimension of the given tensor value.
Definition: TensorOps.cpp:55
SmallVector< OpFoldResult > getMixedSizes(OpBuilder &builder, Location loc, Value value)
Return the dimensions of the given tensor value.
Definition: TensorOps.cpp:65
void registerTilingInterfaceExternalModelsForPackUnPackOps(DialectRegistry &registry)
Similar to the above registration, but it is only for tensor.pack and tensor.unpack ops.
Include the generated interface declarations.
bool isConstantIntValue(OpFoldResult ofr, int64_t value)
Return true if ofr is constant integer equal to value.
std::optional< int64_t > getConstantIntValue(OpFoldResult ofr)
If ofr is a constant integer or an IntegerAttr, return the integer.
LogicalResult reifyResultShapes(OpBuilder &b, Operation *op, ReifiedRankedShapedTypeDims &reifiedReturnShapes)
Reify the shape of the result of an operation (typically in terms of the shape of its operands).
bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2)
Return true if ofr1 and ofr2 are the same integer constant attribute values or the same SSA value.
void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].
Definition: AffineExpr.h:348
void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)
Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
Definition: AffineExpr.h:362
void dispatchIndexOpFoldResults(ArrayRef< OpFoldResult > ofrs, SmallVectorImpl< Value > &dynamicVec, SmallVectorImpl< int64_t > &staticVec)
Helper function to dispatch multiple OpFoldResults according to the behavior of dispatchIndexOpFoldRe...
Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)
Converts an OpFoldResult to a Value.
Definition: Utils.cpp:112
SmallVector< Loops, 8 > tile(ArrayRef< scf::ForOp > forOps, ArrayRef< Value > sizes, ArrayRef< scf::ForOp > targets)
Performs tiling of imperfectly nested loops (with interchange) by strip-mining the forOps by sizes an...
Definition: Utils.cpp:1183
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
SmallVector< NamedAttribute > getPrunedAttributeList(Operation *op, ArrayRef< StringRef > elidedAttrs)
SmallVector< int64_t > invertPermutationVector(ArrayRef< int64_t > permutation)
Helper method to invert a permutation.
Container for result values of tiling.
Helper struct to build simple AffineValueExprs with minimal type inference support.
Definition: Utils.h:349
Holds the result of (div a, b) and (mod a, b).
Definition: Utils.h:301