MLIR  19.0.0git
LoopEmitter.cpp
Go to the documentation of this file.
1 //===- LoopEmitter.cpp ----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "LoopEmitter.h"
10 #include "CodegenUtils.h"
11 
21 
22 using namespace mlir;
23 using namespace mlir::sparse_tensor;
24 
25 //===----------------------------------------------------------------------===//
26 // File local shorthand macros
27 //===----------------------------------------------------------------------===//
28 
29 #define CMPI(p, l, r) \
30  (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r)) \
31  .getResult())
32 
33 #define C_IDX(v) (constantIndex(builder, loc, (v)))
34 #define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs)))
35 #define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs)))
36 #define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
37 #define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
38 #define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
39 #define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
40 #define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
41 #define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))
42 
43 //===----------------------------------------------------------------------===//
44 // Debugging utils
45 //===----------------------------------------------------------------------===//
46 
47 #ifndef NDEBUG
48 LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder,
49  Location loc, Value memref) {
50  memref = builder.create<memref::CastOp>(
51  loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref);
52  createFuncCall(builder, loc, "printMemrefInd", TypeRange{},
54 }
55 #endif
56 
57 //===----------------------------------------------------------------------===//
58 // File local helper functions.
59 //===----------------------------------------------------------------------===//
60 
61 // For index reduction loops, since the tensor are sliced into non-continuous
62 // fragments, we need a triple [pLo, pHi, pPtr], in which the pair (pLo, pHi)
63 // specifies the range of the fragment, and pPtr specifies the index of the
64 // corresponding fragment in the child level (i.e., a pointer to the sliced
65 // position array).
66 static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
67  Level lvl) {
68  auto enc = getSparseTensorEncoding(tensor.getType());
69  return createOrFoldSliceOffsetOp(builder, loc, tensor, toDim(enc, lvl));
70 }
71 
72 static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
73  Level lvl) {
74  auto enc = getSparseTensorEncoding(tensor.getType());
75  return createOrFoldSliceStrideOp(builder, loc, tensor, toDim(enc, lvl));
76 }
77 
78 //===----------------------------------------------------------------------===//
79 // Sparse tensor loop emitter class implementations
80 //===----------------------------------------------------------------------===//
81 
82 LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
83  bool isSparseOut, unsigned numLoops,
84  DependentLvlGetter dimGetter,
85  SparseEmitStrategy emitStrategy) {
86  initialize(tensors, loopTag, hasOutput, isSparseOut, numLoops, dimGetter);
87 }
88 
89 void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
90  bool isSparseOut, unsigned numLoops,
91  DependentLvlGetter dimGetter,
92  SparseEmitStrategy emitStrategy) {
93  // First initialize the top-level type of the fields.
94  this->loopTag = loopTag;
95  this->hasOutput = hasOutput;
96  this->isSparseOut = isSparseOut;
97  this->emitStrategy = emitStrategy;
98 
99  const unsigned numManifestTensors = ts.size();
100  const unsigned synTensorId = numManifestTensors;
101  const unsigned numTensors = numManifestTensors + 1;
102  // tensors array (len == numManifestTensor).
103  this->tensors.assign(ts.begin(), ts.end());
104  // Arrays with len == numTensor.
105  this->valBuffer.assign(numTensors, nullptr);
106  this->lvls.resize(numTensors);
107  this->iters.resize(numTensors);
108 
109  // These zeros will be overwritten below, but we need to initialize
110  // them to something since we'll need random-access assignment.
111  this->loopStack.reserve(numLoops);
112  this->loopSeqStack.reserve(numLoops);
113 
114  // Index-reduction related fields.
115  this->dependentLvlMap.assign(
116  numTensors, std::vector<std::vector<std::pair<TensorLevel, unsigned>>>());
117  this->sliceMeta.assign(
118  numTensors, std::vector<std::vector<std::pair<Value, unsigned>>>());
119  this->levelReducedDep.assign(numTensors, std::vector<unsigned>());
120 
121  // Initialize nested types of `TensorId`-indexed fields.
122  for (TensorId tid = 0; tid < numTensors; tid++) {
123  Level lvlRank;
124  if (tid == synTensorId) {
125  // Synthetic tensor (conceptually) is an all-dense tensor with rank equal
126  // to the total number of loops (each level can potentially be mapped to
127  // one of the loop being generated).
128  lvlRank = numLoops;
129  } else {
130  const Value t = tensors[tid];
131  // a scalar or 0-dimension tensors
133  continue;
134 
135  auto rtp = getRankedTensorType(t);
136  const SparseTensorType stt(rtp);
137  lvlRank = stt.getLvlRank();
138  }
139 
140  lvls[tid].resize(lvlRank);
141  iters[tid].resize(lvlRank);
142  loopHighs.assign(numLoops, nullptr);
143 
144  // Slice-driven loops related initialization.
145  levelReducedDep[tid].assign(lvlRank, 0);
146  dependentLvlMap[tid].assign(
147  lvlRank, std::vector<std::pair<TensorLevel, unsigned>>());
148  sliceMeta[tid].assign(lvlRank, std::vector<std::pair<Value, unsigned>>());
149  if (dimGetter && !isSynTensor(tid)) {
150  for (Level l = 0; l < lvlRank; l++) {
151  std::vector<std::pair<LoopId, unsigned>> deps = dimGetter(tid, l);
152  // Sort the loop by order.
153  std::sort(deps.begin(), deps.end(),
154  [](auto &lhs, auto &rhs) { return lhs.first < rhs.first; });
155 
156  dependentLvlMap[tid][l] = std::move(deps);
157  unsigned depends = dependentLvlMap[tid][l].size();
158  if (depends == 0)
159  continue;
160  sliceMeta[tid][l].reserve(depends);
161  }
162  }
163  }
164 }
165 
166 std::unique_ptr<SparseIterator>
167 LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t,
168  Level l) {
169  auto it = makeSimpleIterator(*lvls[t][l], emitStrategy);
170  auto stt = getSparseTensorType(tensors[t]);
171  if (stt.hasEncoding() && stt.getEncoding().isSlice()) {
172  Value offset = genSliceOffset(builder, loc, tensors[t], l);
173  Value stride = genSliceStride(builder, loc, tensors[t], l);
174  auto slicedIt = makeSlicedLevelIterator(
175  std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy);
176  return slicedIt;
177  }
178  return it;
179 }
180 
182  OpBuilder &builder, Location loc, LoopEmitter::OutputUpdater updater,
184  // For every synthetic tensor, set the high bound by calling the callback.
185  if (synSetter) {
186  TensorId synId = getSynTensorId();
187  for (unsigned i = 0, e = loopHighs.size(); i < e; i++) {
188  Value sz = loopHighs[i] = synSetter(builder, loc, i);
189  auto [stl, it] = makeSynLevelAndIterator(sz, synId, i, emitStrategy);
190  lvls[synId][i] = std::move(stl);
191  iters[synId][i].emplace_back(std::move(it));
192  }
193  }
194 
195  // For every manifest tensor:
196  // * get the values buffer.
197  // * For every level:
198  // * get the positions and coordinates buffers
199  // * get/compute the level-size, which is also used as the upper-bound
200  // on positions.
201  for (TensorId t = 0, numTensors = getNumManifestTensors(); t < numTensors;
202  t++) {
203  const Value tensor = tensors[t];
204  const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
205  if (!rtp)
206  // Skips only scalar, zero ranked tensor still need to be bufferized and
207  // (probably) filled with zeros by users.
208  continue;
209  // FIXME: the definition of `lvlRank` looks more like a dim-rank;
210  // but the variable is used as a level everywhere below, which
211  // suggests there may be some dim/lvl confusion going on here.
212  auto stt = getSparseTensorType(tensor);
213  const Level lvlRank = stt.getLvlRank();
214  const auto shape = rtp.getShape();
215 
216  SmallVector<Value> lvlSzs;
217  for (Level l = 0; l < stt.getLvlRank(); l++) {
218  if (stt.hasEncoding())
219  lvlSzs.push_back(builder.create<LvlOp>(loc, tensor, l));
220  else
221  lvlSzs.push_back(builder.create<tensor::DimOp>(loc, tensor, l));
222  }
223 
224  // Scan all levels of current tensor.
225  for (Level l = 0; l < lvlRank; l++) {
226  // Find upper bound in current dimension.
227  lvls[t][l] = makeSparseTensorLevel(builder, loc, tensor, t, l);
228  if (!dependentLvlMap[t][l].empty())
229  continue;
230 
231  auto it = makeLevelIterator(builder, loc, t, l);
232  iters[t][l].emplace_back(std::move(it));
233  }
234 
235  // Perform the required bufferization. Dense inputs materialize
236  // from the input tensors. Sparse inputs use sparse primitives to obtain the
237  // values.
238  // Delegates extra output initialization to clients.
239  bool isOutput = isOutputTensor(t);
240  Type elementType = stt.getElementType();
241  if (!stt.hasEncoding()) {
242  // Non-annotated dense tensors.
243  BaseMemRefType denseTp = MemRefType::get(shape, elementType);
244 
245  // TODO: if we unconditionally use fully dynamic layout here, it breaks
246  // some vectorization passes which requires static stride = 1.
247  // Is it possible to call vectorization pass after bufferization?
248  if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
250 
251  Value denseVal =
252  builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
253  // Dense outputs need special handling.
254  if (isOutput && updater)
255  denseVal = updater(builder, loc, denseVal, tensor);
256 
257  valBuffer[t] = denseVal;
258  } else {
259  // Annotated sparse tensors.
260  // We also need the value buffer for all-dense annotated "sparse"
261  // tensors.
262  valBuffer[t] = builder.create<ToValuesOp>(loc, tensor);
263  }
264  // NOTE: we can also prepare for 0 lvl here in advance, this will hoist
265  // some loop preparation from tensor iteration, but will also (undesirably)
266  // hoist the code ouside if-conditions.
267  }
268  // TODO: avoid treating subsection iterator as a special case.
269  initSubSectIterator(builder, loc);
270 }
271 
272 void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) {
273  Value c0 = C_IDX(0);
274  for (TensorId t = 0, e = tensors.size(); t < e; t++) {
275  auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
276  if (!rtp)
277  continue;
278 
279  Level lvlRank = SparseTensorType(rtp).getLvlRank();
280 
281  // Compute the dependency reduction order.
282  auto remDepStack = dependentLvlMap;
283  std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
284  for (Level lvl = 0; lvl < lvlRank; lvl++) {
285  // Reverse queue into a stack.
286  std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
287  for (auto [loop, coeff] : dependentLvlMap[t][lvl])
288  depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
289  }
290 
291  if (depRedOrder.empty())
292  continue;
293 
294  std::sort(depRedOrder.begin(), depRedOrder.end(),
295  [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });
296 
297  SmallVector<SparseIterator *> lastIter(tensors.size(), nullptr);
298  for (auto [loop, t, lvl] : depRedOrder) {
299  std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back();
300  assert(curDep.first == loop);
301  remDepStack[t][lvl].pop_back();
302 
303  auto lvlIt = makeLevelIterator(builder, loc, t, lvl);
304  const SparseIterator *parent = lastIter[t];
305  if (!parent && lvl > 0) {
306  if (dependentLvlMap[t][lvl - 1].empty()) {
307  parent = iters[t][lvl - 1].back().get();
308  }
309  }
310 
311  std::unique_ptr<SparseIterator> it;
312  if (!remDepStack[t][lvl].empty()) {
313  // Compute the subsection size.
314  Value size = c0;
315  for (auto [loop, stride] : remDepStack[t][lvl]) {
316  Value idxMax = SUBI(loopHighs[loop], C_IDX(1));
317  size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1)));
318  }
319  it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop],
320  std::move(lvlIt), size, curDep.second,
321  emitStrategy);
322  } else {
323  const SparseIterator &subSectIter = *iters[t][lvl].back();
324  it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent,
325  std::move(lvlIt), loopHighs[loop],
326  curDep.second, emitStrategy);
327  }
328  lastIter[t] = it.get();
329  iters[t][lvl].emplace_back(std::move(it));
330  }
331  }
332 }
333 
334 void LoopEmitter::categorizeIterators(
337  // Finds out the tensor level that we should use to generate loops. Amongs all
338  // the tensor levels, there is at most one sparse tensor level.
339  for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
340  SparseIterator *it = &getCurIterator(t, l);
341  if (it->randomAccessible())
342  raIters.push_back(it);
343  else
344  spIters.push_back(it);
345  }
346 
347  std::stable_sort(spIters.begin(), spIters.end(), [](auto lhs, auto rhs) {
348  // AffineUnRed > Affine > Slice > Trivial
349  return static_cast<uint8_t>(lhs->kind) > static_cast<uint8_t>(rhs->kind);
350  });
351 }
352 
354  ArrayRef<TensorLevel> tidLvls) {
355  // TODO: sort
356  assert(loopSeqStack.size() == loopStack.size());
357  // Prepares for all the tensors used in the current loop sequence.
358 
359  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
360  levelReducedDep[tid][lvl]++;
361  prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
362  }
363 
364  // Universal Index starts from 0.
365  loopSeqStack.emplace_back(C_IDX(0), tidLvls.vec());
366 }
367 
369  assert(loopSeqStack.size() == loopStack.size() + 1);
370 
371  // Depending on whether the slice is resolved or not at current loop sequence,
372  // end them in different ways.
373  for (auto [tid, lvl] : unpackTensorLevelRange(loopSeqStack.back().second))
374  levelReducedDep[tid][lvl]--;
375 
376  loopSeqStack.pop_back();
377 }
378 
380  switch (a.getKind()) {
381  case AffineExprKind::DimId: {
382  // FIXME: since the one callsite in Sparsification passes in a
383  // level-expression, the `getPosition` must in fact be a `Dimension`.
384  // However, elsewhere we have been lead to expect that `loopIdToOrd`
385  // should be indexed by `LoopId`...
386  const auto loopId = cast<AffineDimExpr>(a).getPosition();
387  return loopStack[loopId].iv;
388  }
389  case AffineExprKind::Add: {
390  auto binOp = cast<AffineBinaryOpExpr>(a);
391  return ADDI(genAffine(builder, loc, binOp.getLHS()),
392  genAffine(builder, loc, binOp.getRHS()));
393  }
394  case AffineExprKind::Mul: {
395  auto binOp = cast<AffineBinaryOpExpr>(a);
396  return MULI(genAffine(builder, loc, binOp.getLHS()),
397  genAffine(builder, loc, binOp.getRHS()));
398  }
400  int64_t c = cast<AffineConstantExpr>(a).getValue();
401  return C_IDX(c);
402  }
403  default:
404  llvm_unreachable("unexpected affine subscript");
405  }
406 }
407 
408 std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl(
409  OpBuilder &builder, Location loc, SparseIterator &iter,
410  MutableArrayRef<Value> reduc, bool isParallel) {
411 
412  // TODO: support dynamic slices.
413  // Uses the first dimension here to build the loop bound (which is also the
414  // biggest range).
415 
416  Value step = C_IDX(1);
417  auto [lo, hi] = iter.genForCond(builder, loc);
418  Operation *loop = nullptr;
419  Value iv;
420  if (isParallel) {
421  scf::ParallelOp parOp =
422  builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
423  builder.setInsertionPointToStart(parOp.getBody());
424  assert(parOp.getNumReductions() == reduc.size());
425  iv = parOp.getInductionVars()[0];
426 
427  // In-place update on the reduction variable vector.
428  // Note that the init vals is not the actual reduction variables but instead
429  // used as a "special handle" to (temporarily) represent them. The
430  // expression on init vals will be moved into scf.reduce and replaced with
431  // the block arguments when exiting the loop (see exitForLoop). This is
432  // needed as we can not build the actual reduction block and get the actual
433  // reduction variable before users fill parallel loop body.
434  for (int i = 0, e = reduc.size(); i < e; i++)
435  reduc[i] = parOp.getInitVals()[i];
436  loop = parOp;
437  } else {
438  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
439  builder.setInsertionPointToStart(forOp.getBody());
440  iv = forOp.getInductionVar();
441 
442  // In-place update on the reduction variable vector.
443  assert(forOp.getNumRegionIterArgs() == reduc.size());
444  for (int i = 0, e = reduc.size(); i < e; i++)
445  reduc[i] = forOp.getRegionIterArg(i);
446  loop = forOp;
447  }
448  assert(loop && iv);
449 
450  Value crd = iv;
451  if (!iter.randomAccessible()) {
452  iter.linkNewScope(iv);
453  crd = iter.deref(builder, loc);
454  } else {
455  iter.locate(builder, loc, iv);
456  }
457 
458  return {loop, crd};
459 }
460 
461 std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
462  OpBuilder &builder, Location loc, ArrayRef<SparseIterator *> spIters,
463  MutableArrayRef<Value> reduc, bool needsUniv) {
464  // NOTE: the slice driven tensor-related reduction variable must
465  // appear before normal tensors.
466 
467  // The set of induction variables for the while loop.
468  SmallVector<Value> ivs;
469 
470  // Construct the while-loop with a parameter for each coordinate.
471  for (SparseIterator *it : spIters) {
472  ValueRange itVals = it->getCursor();
473  ivs.append(itVals.begin(), itVals.end());
474  }
475 
476  // The position where user-supplied reduction variable starts.
477  ivs.append(reduc.begin(), reduc.end());
478  // Update universal index.
479  if (needsUniv)
480  ivs.push_back(loopSeqStack.back().first);
481 
482  // Ensures all operands are valid.
483  assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; }));
484  TypeRange types = ValueRange(ivs).getTypes();
485  auto whileOp = builder.create<scf::WhileOp>(loc, types, ivs);
486 
487  SmallVector<Location> locs(types.size(), loc);
488  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
489  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
490 
491  // Generates loop conditions.
492  builder.setInsertionPointToStart(before);
493  ValueRange bArgs = before->getArguments();
494  Value whileCond = nullptr; // bool values for loop condition.
495 
496  for (SparseIterator *it : spIters) {
497  auto [cond, remArgs] = it->genWhileCond(builder, loc, bArgs);
498  whileCond = !whileCond ? cond : ANDI(whileCond, cond);
499  bArgs = remArgs;
500  }
501  // The remaining block arguments are user-provided reduction values and an
502  // optional universal index. Make sure their sizes match.
503  assert(bArgs.size() == reduc.size() + needsUniv ? 1 : 0);
504  builder.create<scf::ConditionOp>(loc, whileCond, before->getArguments());
505 
506  // Generates loop body.
507  builder.setInsertionPointToStart(after);
508  ValueRange aArgs = after->getArguments();
509  // Since some LoopCondKind might need extra checks to filter out invalid
510  // iterations, we maintains another array to hold the iteration arguments to
511  // yield if the checks fails.
512  SmallVector<Value> nextArgs(aArgs.begin(), aArgs.end());
513 
514  for (SparseIterator *it : spIters) {
515  aArgs = it->linkNewScope(aArgs);
516  // Dereference the iterator to cache the coordinate.
517  it->deref(builder, loc);
518  }
519 
520  // In-place update on reduction variable.
521  assert(aArgs.size() == reduc.size() + needsUniv ? 1 : 0);
522  for (unsigned i = 0, e = reduc.size(); i < e; i++)
523  reduc[i] = aArgs[i];
524 
525  Value min;
526  // Finds the minimum coordinate
527  if (!needsUniv) {
528  for (SparseIterator *it : spIters) {
529  if (min) {
530  Value cmp = CMPI(ult, it->getCrd(), min);
531  min = SELECT(cmp, it->getCrd(), min);
532  } else {
533  min = it->getCrd();
534  }
535  }
536  } else {
537  // Otherwise, universal index is the minimal pos.
538  min = whileOp.getAfterArguments().back();
539  }
540 
541  return {whileOp, min};
542 }
543 
544 bool LoopEmitter::shouldIteratedByForLoop(ArrayRef<SparseIterator *> spIters) {
545  // If we need to co-iterate over two sparse tensors, we need a while loop
546  if (spIters.size() > 1)
547  return false;
548 
549  if (spIters.size() == 1)
550  return spIters.front()->iteratableByFor();
551 
552  return true;
553 }
554 
556  OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
557  MutableArrayRef<Value> reduc, bool tryParallel, bool needsUniv) {
558 
559  // TODO: support multiple return on parallel for?
560  tryParallel = tryParallel && reduc.size() <= 1;
561 
564  categorizeIterators(tidLvls, raIters, spIters);
565 
566  // Only when there is at least one sparse conditions, do we really need the
567  // universal index.
568  // TODO: Maybe we should instead requires merger to pass in a valid value at
569  // the first place instead of adjusting it in LoopEmitter?
570  needsUniv = !spIters.empty() && needsUniv;
571  // The TensorLevel used for loop conditions.
572  // If there is any sparse level, we need to use the sparse condition.
573  // If all levels are dense, we can pick arbitrary one (dense slice-driven loop
574  // can be generated using a simple ForOp as well).
575  Operation *l = nullptr;
576  Value iv = nullptr;
578 
579  // Generates loops differently depending on whether we need a slice-driven
580  // loop or a simple level traversal loop.
581  if (shouldIteratedByForLoop(spIters) && !needsUniv) {
582  assert(spIters.size() <= 1);
583  SparseIterator &it = spIters.empty() ? *raIters.front() : *spIters.front();
584  std::tie(l, iv) =
585  emitForLoopOverTensorAtLvl(builder, loc, it, reduc, tryParallel);
586  tls.push_back(makeTensorLevel(it.tid, it.lvl));
587  } else {
588  for (auto *it : spIters) {
589  tls.push_back(makeTensorLevel(it->tid, it->lvl));
590  }
591 
592  if (needsUniv)
593  for (auto *it : raIters)
594  tls.push_back(makeTensorLevel(it->tid, it->lvl));
595 
596  std::tie(l, iv) =
597  emitWhileLoopOverTensorsAtLvls(builder, loc, spIters, reduc, needsUniv);
598  }
599 
600  // Enter dense tensor levels.
601  for (SparseIterator *it : raIters)
602  it->locate(builder, loc, iv);
603 
604  // NOTE: we can also prepare for next dim here in advance
605  // Pushes the loop into stack.
606  loopStack.emplace_back(tls, l, builder.getInsertionBlock(), iv, loopTag);
607  return l;
608 }
609 
611  TensorLevel tidLvl,
612  AffineExpr lvlExpr) {
613  auto [tid, lvl] = unpackTensorLevel(tidLvl);
614 
615  const SparseIterator *parent =
616  lvl == 0 ? nullptr : iters[tid][lvl - 1].back().get();
617  auto &it = getCurIterator(tid, lvl);
618  it.genInit(builder, loc, parent);
619 
620  assert(it.kind == IterKind::kTrivial && it.randomAccessible());
621  Value lvlCrd = genAffine(builder, loc, lvlExpr);
622  it.locate(builder, loc, lvlCrd);
623 }
624 
625 void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
626  TensorId tid, Level lvl) {
627  // if this is the first level, there is no parent iterator for the current
628  // iterator.
629  // If the current iterator is a subsection-based iterator, the parent iterator
630  // is memorized by the iterator.
631  bool hasParent = lvl == 0 || !dependentLvlMap[tid][lvl].empty();
632 
633  const SparseIterator *parent =
634  hasParent ? nullptr : iters[tid][lvl - 1].back().get();
635  auto &it = getCurIterator(tid, lvl);
636  it.genInit(builder, loc, parent);
637 
638  // Locates the randon accessible iterator to 0.
639  if (it.randomAccessible())
640  it.locate(builder, loc, C_IDX(0));
641 }
642 
643 void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
644  MutableArrayRef<Value> reduc) {
645  const LoopInfo &loopInfo = loopStack.back();
646  if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
647  if (!reduc.empty()) {
648  assert(reduc.size() == forOp.getNumResults());
649  rewriter.create<scf::YieldOp>(loc, reduc);
650  }
651  // Exit the loop.
652  rewriter.setInsertionPointAfter(forOp);
653  // In-place update reduction variables.
654  for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
655  reduc[i] = forOp.getResult(i);
656  } else {
657  auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
658  if (!reduc.empty()) {
659  assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
660  Operation *redExp = reduc.front().getDefiningOp();
661  // Reduction expression should have no use.
662  assert(redExp->getUses().empty());
663  // This must be a binary operation.
664  // NOTE: This is users' responsibility to ensure the operation are
665  // commutative.
666  assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
667 
668  Value redVal = parOp.getInitVals().front();
669  Value curVal;
670  if (redExp->getOperand(0) == redVal)
671  curVal = redExp->getOperand(1);
672  else if (redExp->getOperand(1) == redVal)
673  curVal = redExp->getOperand(0);
674  // One of the operands must be the init value (which is also the
675  // previous reduction value).
676  assert(curVal);
677 #ifndef NDEBUG
678  // The reduction expression should be the only user of the reduction val
679  // inside the parallel for.
680  unsigned numUsers = 0;
681  for (Operation *op : redVal.getUsers()) {
682  if (op->getParentOp() == parOp)
683  numUsers++;
684  }
685  assert(numUsers == 1);
686 #endif // NDEBUG
687 
688  rewriter.setInsertionPointAfter(redExp);
689  auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
690  // Attach to the reduction op.
691  Block *redBlock = &redOp.getReductions().front().front();
692  rewriter.setInsertionPointToEnd(redBlock);
693  Operation *newRed = rewriter.clone(*redExp);
694  // Replaces arguments of the reduction expression by using the block
695  // arguments from scf.reduce.
696  rewriter.modifyOpInPlace(
697  newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
698  // Erases the out-dated reduction expression.
699  rewriter.eraseOp(redExp);
700  rewriter.setInsertionPointToEnd(redBlock);
701  rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
702  }
703  rewriter.setInsertionPointAfter(parOp);
704  // In-place update reduction variables.
705  for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
706  reduc[i] = parOp.getResult(i);
707  }
708 }
709 
710 void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
711  MutableArrayRef<Value> reduc) {
712  const LoopInfo &loopInfo = loopStack.back();
713  auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
714  Value iv = loopInfo.iv;
715  Value one = C_IDX(1);
716 
717  // Finalize the induction. Note that the induction could be performed
718  // in the individual if-branches to avoid re-evaluating the conditions.
719  // However, that would result in a rather elaborate forest of yield
720  // instructions during code generation. Moreover, performing the induction
721  // after the if-statements more closely resembles code generated by TACO.
722  SmallVector<Value> operands;
723  ValueRange whileRes = whileOp.getResults();
724 
725  for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
726  SparseIterator &it = getCurIterator(tid, lvl);
727  if (!it.randomAccessible()) {
728  // Forward the sparse iterator.
729  Value cmp = CMPI(eq, it.getCrd(), iv);
730  it.forwardIf(builder, loc, cmp);
731  operands.append(it.getCursor().begin(), it.getCursor().end());
732  // const Value newPos = whileOp->getResult(o++);
733  // Following loops continue iteration from the break point of the
734  // current while loop.
735  whileRes = it.linkNewScope(whileRes);
736  } else {
737  // Make sure randomly accessible (dense) iterator is set to the right
738  // position according to the universal index.
739  Value uniIdx = whileOp.getResults().back();
740  it.locate(builder, loc, uniIdx);
741  }
742  }
743 
744  // Reduction value from users.
745  for (auto &i : reduc) {
746  operands.push_back(i);
747  // Update user reduction variables.
748  i = whileRes.front();
749  whileRes = whileRes.drop_front();
750  }
751 
752  // An (optional) universal index.
753  if (operands.size() < whileOp.getNumResults()) {
754  assert(operands.size() + 1 == whileOp.getNumResults());
755  // The last one is the universial index.
756  operands.push_back(ADDI(iv, one));
757  // update the loop starting point of current loop sequence
758  loopSeqStack.back().first = whileOp->getResults().back();
759  }
760 
761  if (!operands.empty())
762  YIELD(operands);
763 
764  builder.setInsertionPointAfter(whileOp);
765 }
766 
768  MutableArrayRef<Value> reduc) {
769  // Clean up the values, it would help use to discover potential bug at a
770  // earlier stage (instead of silently using a wrong value).
771  const LoopInfo &loopInfo = loopStack.back();
772 
773  // Sets the insertion point to the right position.
774  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
775  if (!loopInfo.userCodeBlock->empty() &&
776  llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
777  // scf::While/For inserts an implicit yield op when there is no loop
778  // iter args. In this case, we need to insert the code before the yield.
779  assert(loopInfo.userCodeBlock->back().getNumResults() == 0);
780  rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
781  }
782 
783  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
784  exitWhileLoop(rewriter, loc, reduc);
785  } else {
786  exitForLoop(rewriter, loc, reduc);
787  }
788 
789  assert(loopStack.size() == loopSeqStack.size());
790  loopStack.pop_back();
791 }
792 
793 #undef CMPI
794 #undef C_IDX
795 #undef YIELD
796 #undef ADDI
797 #undef ANDI
798 #undef SUBI
799 #undef MULI
800 #undef SELECT
static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor, Level lvl)
Definition: LoopEmitter.cpp:72
#define SUBI(lhs, rhs)
Definition: LoopEmitter.cpp:37
#define MULI(lhs, rhs)
Definition: LoopEmitter.cpp:38
static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor, Level lvl)
Definition: LoopEmitter.cpp:66
#define C_IDX(v)
Definition: LoopEmitter.cpp:33
#define ANDI(lhs, rhs)
Definition: LoopEmitter.cpp:36
#define CMPI(p, l, r)
Definition: LoopEmitter.cpp:29
static LLVM_ATTRIBUTE_UNUSED void dumpIndexMemRef(OpBuilder &builder, Location loc, Value memref)
Definition: LoopEmitter.cpp:48
#define YIELD(vs)
Definition: LoopEmitter.cpp:34
#define SELECT(c, l, r)
Definition: LoopEmitter.cpp:41
#define ADDI(lhs, rhs)
Definition: LoopEmitter.cpp:35
static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound)
Base type for affine expression.
Definition: AffineExpr.h:69
AffineExprKind getKind() const
Return the classification for this type.
Definition: AffineExpr.cpp:27
This class provides a shared interface for ranked and unranked memref types.
Definition: BuiltinTypes.h:138
Block represents an ordered list of Operations.
Definition: Block.h:30
BlockArgListType getArguments()
Definition: Block.h:84
Operation & front()
Definition: Block.h:150
IndexType getIndexType()
Definition: Builders.cpp:71
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
This class helps build Operations.
Definition: Builders.h:209
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
Definition: Builders.cpp:553
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Definition: Builders.h:433
void setInsertionPoint(Block *block, Block::iterator insertPoint)
Set the insertion point to the specified location.
Definition: Builders.h:400
void setInsertionPointToEnd(Block *block)
Sets the insertion point to the end of the specified block.
Definition: Builders.h:438
Block * createBlock(Region *parent, Region::iterator insertPt={}, TypeRange argTypes=std::nullopt, ArrayRef< Location > locs=std::nullopt)
Add new block with 'argTypes' arguments and set the insertion point to the end of it.
Definition: Builders.cpp:437
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Definition: Builders.cpp:464
void setInsertionPointAfter(Operation *op)
Sets the insertion point to the node after the specified operation, which will cause subsequent inser...
Definition: Builders.h:414
Block * getInsertionBlock() const
Return the block the current insertion point belongs to.
Definition: Builders.h:444
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
Value getOperand(unsigned idx)
Definition: Operation.h:345
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
Definition: Operation.h:402
unsigned getNumOperands()
Definition: Operation.h:341
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
Definition: Operation.h:234
void setOperands(ValueRange operands)
Replace the current operands of this operation with the ones provided in 'operands'.
Definition: Operation.cpp:237
use_range getUses()
Returns a range of all uses, which is useful for iterating over all uses.
Definition: Operation.h:842
unsigned getNumResults()
Return the number of results held by this operation.
Definition: Operation.h:399
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:400
virtual void eraseOp(Operation *op)
This method erases an operation that is known to have no uses.
void modifyOpInPlace(Operation *root, CallableT &&callable)
This method is a utility wrapper around an in-place modification of an operation.
Definition: PatternMatch.h:630
This class provides an abstraction over the various different ranges of value types.
Definition: TypeRange.h:36
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
Definition: Types.h:74
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:381
type_range getTypes() const
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
Type getType() const
Return the type of this value.
Definition: Value.h:125
user_range getUsers() const
Definition: Value.h:224
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
Definition: Value.cpp:20
void exitCurrentLoop(RewriterBase &rewriter, Location loc, MutableArrayRef< Value > reduc={})
Generates code to exit the current loop (e.g., generates yields, forwards loop induction variables,...
void locateLvlAtAffineAddress(OpBuilder &builder, Location loc, TensorLevel tidLvl, AffineExpr lvlExpr)
Emits the address for a dense level based on the value evaluated by the provided affine expression.
void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef< TensorLevel > tidLvls)
Enters a new loop sequence, the loops within the same sequence starts from the break points of previo...
Value genAffine(OpBuilder &builder, Location loc, AffineExpr a)
Generates code to compute an affine expression whose variables are LoopIds (i.e., a....
Operation * enterCoIterationOverTensorsAtLvls(OpBuilder &builder, Location loc, ArrayRef< TensorLevel > tidLvls, MutableArrayRef< Value > reduc={}, bool isParallel=false, bool needsUniv=false)
Emits a co-iteration loop over a set of tensors.
TensorLevel makeTensorLevel(TensorId t, Level l) const
Compresses a TensorId and Level into a TensorLevel.
Definition: LoopEmitter.h:199
unsigned getNumManifestTensors() const
Gets the total number of manifest tensors (excluding the synthetic tensor).
Definition: LoopEmitter.h:181
void initialize(ValueRange tensors, StringAttr loopTag=nullptr, bool hasOutput=false, bool isSparseOut=false, unsigned numLoops=0, DependentLvlGetter getter=nullptr, SparseEmitStrategy emitStrategy=SparseEmitStrategy::kFunctional)
Takes an array of input tensors, which the generated loops will iterate over.
Definition: LoopEmitter.cpp:89
std::pair< TensorId, Level > unpackTensorLevel(TensorLevel tidLvl) const
De-compresses a TensorLevel back to a pair of TensorId and Level.
Definition: LoopEmitter.h:204
auto unpackTensorLevelRange(ContainerTy &&c) const
Converts a range of TensorLevel to a range of std::pair<TensorId, Level>
Definition: LoopEmitter.h:211
void initializeLoopEmit(OpBuilder &builder, Location loc, OutputUpdater updater=nullptr, SynTensorBoundSetter synSetter=nullptr)
Starts a loop emitting session by generating all the buffers needed for iterating over the tensors.
void exitCurrentLoopSeq(OpBuilder &builder, Location loc)
Exits the current loop sequence, this will reset universal index to 0.
TensorId getSynTensorId() const
Gets the TensorId for synthetic tensor.
Definition: LoopEmitter.h:190
Helper class that generates loop conditions, etc, to traverse a sparse tensor level.
virtual std::pair< Value, Value > genForCond(OpBuilder &b, Location l)
void genInit(OpBuilder &b, Location l, const SparseIterator *p)
void locate(OpBuilder &b, Location l, Value crd)
virtual ValueRange forwardIf(OpBuilder &b, Location l, Value cond)
ValueRange linkNewScope(ValueRange pos)
Value deref(OpBuilder &b, Location l)
virtual bool randomAccessible() const =0
std::pair< Value, ValueRange > genWhileCond(OpBuilder &b, Location l, ValueRange vs)
A wrapper around RankedTensorType, which has three goals:
Level getLvlRank() const
Returns the level-rank.
BaseMemRefType getMemRefTypeWithFullyDynamicLayout(TensorType tensorType, Attribute memorySpace=nullptr)
Return a MemRef type with fully dynamic layout.
Dimension toDim(SparseTensorEncodingAttr enc, Level l)
Convenience method to translate the given level to the corresponding dimension.
unsigned TensorLevel
Definition: LoopEmitter.h:26
std::unique_ptr< SparseIterator > makeTraverseSubSectIterator(OpBuilder &b, Location l, const SparseIterator &subsectIter, const SparseIterator &parent, std::unique_ptr< SparseIterator > &&wrap, Value loopBound, unsigned stride, SparseEmitStrategy strategy)
Helper function to create a SparseIterator object that iterate over a non-empty subsection created by...
uint64_t Level
The type of level identifiers and level-ranks.
Definition: SparseTensor.h:38
RankedTensorType getRankedTensorType(T &&t)
Convenience method to abbreviate casting getType().
Definition: SparseTensor.h:74
std::unique_ptr< SparseIterator > makeSimpleIterator(const SparseTensorLevel &stl, SparseEmitStrategy strategy)
Helper function to create a simple SparseIterator object that iterate over the SparseTensorLevel.
std::pair< std::unique_ptr< SparseTensorLevel >, std::unique_ptr< SparseIterator > > makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl, SparseEmitStrategy strategy)
Helper function to create a synthetic SparseIterator object that iterate over a dense space specified...
Value createOrFoldSliceStrideOp(OpBuilder &builder, Location loc, Value tensor, Dimension dim)
Generates code to retrieve the slice slice for the sparse tensor slice, return a constant if the offs...
SparseTensorEncodingAttr getSparseTensorEncoding(Type type)
Convenience method to get a sparse encoding attribute from a type.
bool isZeroRankedTensorOrScalar(Type type)
Definition: CodegenUtils.h:402
std::unique_ptr< SparseTensorLevel > makeSparseTensorLevel(OpBuilder &builder, Location loc, Value t, unsigned tid, Level l)
Helper function to create a TensorLevel object from given tensor.
SparseTensorType getSparseTensorType(Value val)
Convenience methods to obtain a SparseTensorType from a Value.
func::CallOp createFuncCall(OpBuilder &builder, Location loc, StringRef name, TypeRange resultType, ValueRange operands, EmitCInterface emitCInterface)
Creates a CallOp to the function reference returned by getFunc() in the builder's module.
std::unique_ptr< SparseIterator > makeSlicedLevelIterator(std::unique_ptr< SparseIterator > &&sit, Value offset, Value stride, Value size, SparseEmitStrategy strategy)
Helper function to create a SparseIterator object that iterate over a sliced space,...
Value createOrFoldSliceOffsetOp(OpBuilder &builder, Location loc, Value tensor, Dimension dim)
Generates code to retrieve the slice offset for the sparse tensor slice, return a constant if the off...
std::unique_ptr< SparseIterator > makeNonEmptySubSectIterator(OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound, std::unique_ptr< SparseIterator > &&delegate, Value size, unsigned stride, SparseEmitStrategy strategy)
Helper function to create a SparseIterator object that iterate over the non-empty subsections set.
unsigned TensorId
Tensor identifiers, chosen to be the BlockArgument::getArgNumber of the value passed to Merger::build...
Definition: Merger.h:35
Include the generated interface declarations.
@ Mul
RHS of mul is always a constant or a symbolic expression.
@ DimId
Dimensional identifier.
@ Constant
Constant integer.
SparseEmitStrategy
Defines a scope for reinterpret map pass.
Definition: Passes.h:51
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...