MLIR  18.0.0git
LoopEmitter.h
Go to the documentation of this file.
1 //===- LoopEmitter.h --------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
10 #define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
11 
#include <vector>

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
#include "mlir/IR/PatternMatch.h"

19 namespace mlir {
20 namespace sparse_tensor {
21 
//===----------------------------------------------------------------------===//
/// The position of a loop in the loop-stack, or the position of a
/// `LoopId` in a topologically-sorted list of `LoopId`s.
///
/// Although this type may have the same cardinality as `LoopId`, it must
/// not be confused with that type. The `LoopId` type is used by the `Merger`
/// as a unique identifier for loop-variables, regardless of the ordering
/// of those loops. Whereas the `LoopOrd` type is used by the `LoopEmitter`
/// (and `CodegenEnv`) to refer to the actual order in which loops are
/// generated.
///
/// TODO: further explicate the correspondences between these various
/// types. In particular, since the `$dim` argument to `linalg::IndexOp`
/// is a De Bruijn index, it seems like that should correspond to `LoopOrd`,
/// and yet the `Merger` has that correspond with `LoopId` instead.
/// In addition `LoopEmitter::genAffine` has `AffineDimExpr::position`
/// correspond to `LoopId`, however it is unclear what the provenance
/// of those `AffineDimExpr` is.
//
// TODO: use a struct/class rather than a typedef, so that we can actually
// typecheck this to avoid mixups in the code.
using LoopOrd = unsigned;

// A compressed <tensor id, level> pair.
using TensorLevel = unsigned;
47 //===----------------------------------------------------------------------===//
48 // SparseTensorLoopEmiter class, manages sparse tensors and helps to
49 // generate loop structure to (co)-iterate sparse tensors.
50 //
51 // An example usage:
52 // To generate the following loops over T1<?x?> and T2<?x?>
53 //
54 // for i in TENSOR_1_0 {
55 // for j : TENSOR_2_0 {
56 // for k : TENSOR_1_1 {}
57 // for k : TENSOR_2_1 {}
58 // }
59 // }
60 //
61 // One can use
62 //
63 // LoopEmiter loopEmiter({T1, T1});
64 // loopEmiter.initializeLoopEmit();
65 // loopEmiter.enterLoopOverTensorAtLvl(T1, 0);
66 // loopEmiter.enterLoopOverTensorAtLvl(T2, 0);
67 // loopEmiter.enterLoopOverTensorAtLvl(T1, 1);
68 // loopEmiter.exitCurrentLoop();
69 // loopEmiter.enterLoopOverTensorAtLvl(T2, 1);
70 // loopEmiter.exitCurrentLoop(); // exit k
71 // loopEmiter.exitCurrentLoop(); // exit j
72 // loopEmiter.exitCurrentLoop(); // exit i
73 //===----------------------------------------------------------------------===//
74 
75 class LoopEmitter {
76 public:
77  /// Optional callback function to setup dense output tensors when
78  /// initializing the loop emitter (e.g., to fill a dense output with zeros).
80  Value memref, Value tensor)>;
81 
82  /// Optional callback function to set the bound for the synthetic tensor,
83  /// which essentially is the dense loop bound.
85  function_ref<Value(OpBuilder &builder, Location loc, Level lvl)>;
86 
87  // Map from [tid, lvl] to a list of dependent [tidlvl, coeffecient] for
88  // subscript expressions on sparse tensors.
89  //
90  // E.g., for affine index (2 * d0 + d1), it depends on two tidlvls that
91  // defines d0 and d1 (for affine expression reduction) and uses 2 and 1 for
92  // cofficients on d0, d1 respectively.
93  // If the list is empty, it means that there is no affine expression on the
94  // input [tid, lvl].
95  //
96  // NOTE: The caller is responsible to ensure that the order of the returned
97  // list to be consistent with the topological order of the iteration graph,
98  // otherwise the loop emitter might reduce a wrong dependent index variable
99  // when generating slice-driven loops.
102  Level)>;
103 
104  LoopEmitter() = default;
105 
106  /// Takes an array of input tensors, which the generated loops will
107  /// iterate over. Each tensor is given a `TensorId` (numerically equal
108  /// to the position of that tensor `Value` in the array). Setting
109  /// `isSparseOut` indicates that the sparse output tensor is empty,
110  /// so the loop emitter will generate loops over it according to the
111  /// level-sizes. The `topSort` array specifies the actual order in
112  /// which loops are generated, thus providing a mapping from `LoopOrd`
113  /// to `LoopId`.
114  void initialize(ValueRange tensors, StringAttr loopTag = nullptr,
115  bool hasOutput = false, bool isSparseOut = false,
116  unsigned numLoops = 0, DependentLvlGetter getter = nullptr);
117 
118  explicit LoopEmitter(ValueRange tensors, StringAttr loopTag = nullptr,
119  bool hasOutput = false, bool isSparseOut = false,
120  unsigned numLoops = 0,
121  DependentLvlGetter getter = nullptr);
122 
123  /// Starts a loop emitting session by generating all the buffers needed
124  /// for iterating over the tensors.
125  void initializeLoopEmit(OpBuilder &builder, Location loc,
126  OutputUpdater updater = nullptr,
127  SynTensorBoundSetter synSetter = nullptr);
128 
129  /// Generates code to compute an affine expression whose variables are
130  /// `LoopId`s (i.e., `a.cast<AffineDimExpr>().getPosition()` is a valid
131  /// `LoopId`).
132  Value genAffine(OpBuilder &builder, Location loc, AffineExpr a);
133 
134  /// Enters a new loop sequence, the loops within the same sequence starts
135  /// from the break points of previous loop instead of starting over from 0.
136  /// e.g.,
137  /// {
138  /// // loop sequence start.
139  /// p0 = while(xxx)
140  /// ...
141  /// break p0
142  ///
143  /// // Starts loop from p0
144  /// for (i = p0; i < end; i++)
145  /// ...
146  /// // loop sequence end.
147  /// }
148  void enterNewLoopSeq(OpBuilder &builder, Location loc,
149  ArrayRef<TensorLevel> tidLvls);
150 
151  /// Exits the current loop sequence, this will reset universal index to 0.
152  void exitCurrentLoopSeq(OpBuilder &builder, Location loc);
153 
154  /// Enters a loop that tries to locate a coordinates in a sparse level based
155  /// on the value evaluated by the provided affine expression.
156  /// DEPRECATED: affine index expression should be handled by index reduction
157  /// loop, filter loop-based solution is slow.
159  TensorId tid, Level lvl,
160  AffineExpr affine,
161  MutableArrayRef<Value> reduc = {});
162 
163  /// Emits the address for a dense level based on the value evaluated by the
164  /// provided affine expression.
165  /// DEPRECATED: affine index expression should be handled by index reduction
166  /// loop, filter loop-based solution is slow.
167  void genDenseAffineAddress(OpBuilder &builder, Location loc,
168  TensorLevel tidLvl, AffineExpr lvlExpr);
169 
170  // TODO: Get rid of `lvls` in the argument list? Track the level we
171  // are currently at internally. Then it would be enterNextLvlForTensor.
172  // Still need a way to specify the lvl for non-annotated tensors though,
173  // as those can be accessed out of order.
174  //
175  /// Emits a co-iteration loop over a set of tensors.
176  /// Emits loop over tensor_tid_lvl, it assumes that loops between
177  /// tensor_tid_[0, lvl - 1] have already been generated.
178  /// The function will also perform in-place update on the `reduc` vector to
179  /// return the reduction variable used inside the generated loop.
181  OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
182  MutableArrayRef<Value> reduc = {}, bool isParallel = false,
183  bool genDedup = false, bool needsUniv = false);
184 
185  /// Generates code to exit the current loop (e.g., generates yields, forwards
186  /// loop induction variables, etc).
187  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
188  MutableArrayRef<Value> reduc = {});
189 
190  /// Get the range of values for all induction variables.
191  auto getLoopIVsRange() const {
192  return llvm::map_range(loopStack, [](const LoopInfo &li) { return li.iv; });
193  }
194 
195  /// Fills the out-parameter with the loop induction variables for all
196  /// loops in the current loop-stack. The variables are given in the
197  /// same order as the loop-stack, hence `ivs` should be indexed into
198  /// by `LoopOrd` (not `LoopId`).
200  return llvm::to_vector(getLoopIVsRange());
201  }
202 
203  /// Gets the current depth of the loop-stack. The result is given
204  /// the type `LoopOrd` for the same reason as one-past-the-end iterators.
206  return llvm::range_size(getLoopIVsRange());
207  }
208 
209  /// Gets loop induction variable for the given `LoopOrd`.
211  if (n >= getCurrentDepth())
212  return Value();
213  auto it = getLoopIVsRange().begin();
214  std::advance(it, n);
215  return *it;
216  }
217 
218  /// Gets the total number of manifest tensors (excluding the synthetic
219  /// tensor).
220  unsigned getNumManifestTensors() const { return tensors.size(); }
221 
222  /// Gets the total number of tensors that loopEmitter is operating on.
223  unsigned getNumTensors() const {
224  // Manifest tensors with one synthetic tensor at the end.
225  return getNumManifestTensors() + 1;
226  }
227 
228  /// Gets the TensorId for synthetic tensor.
229  TensorId getSynTensorId() const { return tensors.size(); }
230 
231  /// Gets the TensorId for output tensor.
233  assert(hasOutput);
234  return getNumManifestTensors() - 1;
235  }
236 
237  /// Compresses a TensorId and Level into a TensorLevel.
239  return l * getNumTensors() + t;
240  }
241 
242  /// De-compresses a TensorLevel back to a pair of TensorId and Level.
243  std::pair<TensorId, Level> unpackTensorLevel(TensorLevel tidLvl) const {
244  unsigned nt = getNumTensors();
245  return std::make_pair(tidLvl % nt, tidLvl / nt);
246  }
247 
248  /// Converts a range of TensorLevel to a range of std::pair<TensorId, Level>
249  template <class ContainerTy>
250  auto unpackTensorLevelRange(ContainerTy &&c) const {
251  using EltTy = decltype(*c.begin());
252  static_assert(std::is_same_v<llvm::remove_cvref_t<EltTy>, TensorLevel>,
253  "Must be unpacking a TensorLevel range");
254  return llvm::map_range(std::forward<ContainerTy>(c), [this](EltTy tl) {
255  return this->unpackTensorLevel(tl);
256  });
257  }
258 
259  template <class ContainerTy>
260  auto unpackTensorLevelFromCondRange(ContainerTy &&c) const {
261  using EltTy = decltype(*c.begin());
262  static_assert(std::is_same_v<llvm::remove_cvref_t<EltTy>, TensorLvlCond>,
263  "Must be unpacking a TensorLvlCond range");
264  return unpackTensorLevelRange(
265  llvm::make_first_range(std::forward<ContainerTy>(c)));
266  }
267 
268  ///
269  /// Getters.
270  ///
271  const std::vector<std::vector<Value>> &getPosits() const { return posits; };
272  const std::vector<std::vector<Value>> &getCoords() const { return coords; };
273  const std::vector<std::vector<Value>> &getHighs() const { return highs; };
274  const std::vector<std::vector<Value>> &getPositionBuffers() const {
275  return positionsBuffers;
276  };
277  const std::vector<std::vector<Value>> &getCoordinateBuffers() const {
278  return coordinatesBuffers;
279  };
280  const std::vector<Value> &getValBuffer() const { return valBuffer; };
281 
282  constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() {
283  return llvm::StringLiteral("Emitted from");
284  }
285 
286 private:
287  ///
288  /// Structure definitions that hold different kinds of loops information.
289  ///
290 
291  // A tuple that stored the slice-driven loop information.
292  struct SliceLoopInfo final {
293  SliceLoopInfo(TensorId tid, Level lvl, bool reduced)
294  : tid(tid), lvl(lvl), reduced(reduced) {}
295  TensorId tid;
296  Level lvl;
297  bool reduced;
298  };
299  // LoopInfo stores information of a loop generated by LoopEmitter. E.g.,
300  // the set of tensors levels that the loop is iterating over.
301  struct LoopInfo final {
302  LoopInfo(ArrayRef<TensorLevel> trivialTidLvls,
303  ArrayRef<SliceLoopInfo> sliceDrivenInfo, Operation *loop,
304  Block *userBlock, Value iv, StringAttr loopTag)
305  : trivialTidLvls(trivialTidLvls), sliceDrivenInfo(sliceDrivenInfo),
306  loop(loop), userCodeBlock(userBlock), iv(iv) {
307  // Attached a special tag to loop emitter generated loop.
308  if (loopTag)
309  loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag);
310  }
311  // The set of <tensor, lvl>, with *only* trivial index expressions, that are
312  // used as the condition for the generated loop. Extra information is
313  // required for levels with non-tivial index expressions, which is
314  // maintained by the sliceDrivenInfo array below.
315  const llvm::SmallVector<TensorLevel> trivialTidLvls;
316  // The set of <tensor, lvl>, with *only* non-trivial index expressions, that
317  // are used as the condition for the generated loop.
318  const llvm::SmallVector<SliceLoopInfo> sliceDrivenInfo;
319  const Operation *loop; // the loop operation
320  Block *const userCodeBlock; // the block holding users' generated code.
321  const Value iv; // the induction variable for the loop
322  };
323 
324  // SliceInfo stores information of an extracted slice for slice-driven loop.
325  // E.g., the in-scope SSA values for the minimum coordinates and offset for
326  // the slice, etc.
327  struct SliceInfo final {
328  // Note that we do not need to create a actual sparse tensor slice but
329  // instead only need to maintain the metadata of the slice.
330  SliceInfo(Value minCrd, Value offset, Value isNonEmpty,
331  std::optional<Level> slicedOnLvl, unsigned depth)
332  : minCrd(minCrd), offset(offset), isNonEmpty(isNonEmpty),
333  slicedOnLvl(slicedOnLvl), depth(depth) {
334  // TODO: use std::optional<pair<Level, minCrd>>
335  assert(!slicedOnLvl || minCrd);
336  }
337 
338  // Whether this is the tensor that has not yet been sliced.
339  bool isInitialTensor() const { return !slicedOnLvl.has_value(); }
340 
341  Value minCrd; // the minimum coordinate of the slice.
342  Value offset; // the *absolute* offset of the current slice.
343  Value isNonEmpty; // whether the slice is empty.
344  std::optional<Level> slicedOnLvl; // the level on which the slice is done
345  unsigned depth; // the depth (relative to dependentDimMap[tid][lvl]).
346  };
347 
348  ///
349  /// Enums for different kinds of loop conditions.
350  ///
351 
352  // The bit indicating whether the loop conditions is sparse.
353  static constexpr uint8_t kSparseCond = 1 << 3;
354  // The bit indicating whether the loop iterates over sparse tensor slices
355  // (i.e., with non-empty SliceDimAttr).
356  static constexpr uint8_t kSliceCond = 1 << 2;
357  // The bit indicating whether the loop iterates over tensor levels with
358  // non-trivial affine index reduction.
359  static constexpr uint8_t kAffineIdxCond = 1 << 1;
360  // The bit indicating whether the loop iterates over tensor levels with
361  // non-trivial affine index reduction, and it is not fully reduced.
362  static constexpr uint8_t kAffineIdxCondUnRed = 1 << 0;
363 
364  enum class LoopCondKind : uint8_t {
365  // Dense conditions.
366  DenseCond = 0,
367  DenseSliceCond = kSliceCond,
368  DenseAffineCond = kAffineIdxCond,
369  DenseAffineUnRedCond = kAffineIdxCond | kAffineIdxCondUnRed,
370  // Sparse Conditions.
371  SparseCond = kSparseCond,
372  SparseSliceCond = kSparseCond | kSliceCond,
373  SparseAffineCond = kSparseCond | kAffineIdxCond,
374  SparseAffineUnRedCond = kSparseCond | kAffineIdxCond | kAffineIdxCondUnRed,
375  };
376  using TensorLvlCond = std::pair<TensorLevel, LoopCondKind>;
377 
378  /// Sparse or dense loop condition.
379  static bool isSparseCond(LoopCondKind k) {
380  return static_cast<uint8_t>(k) & kSparseCond;
381  }
382  static bool isDenseCond(LoopCondKind k) { return !isSparseCond(k); }
383 
384  /// Whether loops over sparse tensor slices or sparse tensors.
385  static bool isSliceCond(LoopCondKind k) {
386  return static_cast<uint8_t>(k) & kSliceCond;
387  }
388 
389  /// Affine or trivial index expression loop condition.
390  static bool isAffineIdxCond(LoopCondKind k) {
391  return static_cast<uint8_t>(k) & kAffineIdxCond;
392  }
393  static bool isTrivalIdxCond(LoopCondKind k) { return !isAffineIdxCond(k); }
394 
395  /// Whether the affine index expression is fully reduced.
396  static bool isAffineIdxUnRedCond(LoopCondKind k) {
397  return isAffineIdxCond(k) && static_cast<uint8_t>(k) & kAffineIdxCondUnRed;
398  }
399  static bool isAffineIdxRedCond(LoopCondKind k) {
400  return isAffineIdxCond(k) && !isAffineIdxUnRedCond(k);
401  }
402 
403  // Whether the loop condition kind requires extra check inside the loop body.
404  // E.g., to iterate over sparse tensor slice, we need to check whether the
405  // current cooridnate is on the slice (e.g., due to stride) or not.
406  static bool isCondWithExtraCheck(LoopCondKind k) {
407  return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
408  }
409 
410  static LoopCondKind makeLoopCondKind(bool isSparse, bool isSlice,
411  bool isAffine, bool isUnRedu) {
412  assert(!isUnRedu || isAffine);
413  uint8_t bits = 0;
414  bits = isSparse ? bits | kSparseCond : bits;
415  bits = isSlice ? bits | kSliceCond : bits;
416  bits = isAffine ? bits | kAffineIdxCond : bits;
417  bits = isUnRedu ? bits | kAffineIdxCondUnRed : bits;
418  LoopCondKind kind = static_cast<LoopCondKind>(bits);
419 
420  // Sanity checks.
421  assert(isSparse == isSparseCond(kind));
422  assert(isSlice == isSliceCond(kind));
423  assert(isAffine == isAffineIdxCond(kind));
424  assert(isUnRedu == isAffineIdxUnRedCond(kind));
425  return kind;
426  }
427 
428  void categorizeLoopCondition(ArrayRef<TensorLevel> tidLvls,
429  SmallVectorImpl<TensorLvlCond> &dnConds,
430  SmallVectorImpl<TensorLvlCond> &spConds);
431 
432  ///
433  /// LoopEmitter internal helper functions.
434  ///
435 
436  using LoopBodyBuilder = llvm::function_ref<void(OpBuilder &, Location, Value,
437  MutableArrayRef<Value>)>;
438 
439  /// Whether the list of the sparse condition should be iterated by for loop.
440  bool shouldIteratedByForLoop(ArrayRef<TensorLvlCond> spConds, bool genDedup);
441 
442  /// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
443  Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl,
444  Value iv);
445 
446  /// Generates the segment high for a non-unique level (to fast forward
447  /// duplicated coordinates). That is, it generates the code:
448  ///
449  /// crd = coordinates_tid_lvl[pos]
450  /// while (pos < pHi && coordinates_tid_lvl[pos] == crd)
451  /// pos++;
452  /// <return pos>;
453  Value genSegmentHigh(OpBuilder &builder, Location loc, TensorId tid,
454  Level lvl, Value pos, Value pHi);
455 
456  /// Generates instructions to compute the coordinate of tensors[tid][lvl]
457  /// under the current loop context. The final argument is the
458  /// collapsed-output level, whereas this function handles converting
459  /// that to the uncollapsed-input level
460  Value genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
461  Level dstLvl);
462 
463  /// Generates a predicate to determine whether the tranformed coordinates are
464  /// in the given slice.
465  /// Returns std::pair<Transformed coordinates, Predicate>
466  std::pair<Value, Value> genSliceLegitPredicate(OpBuilder &builder,
467  Location loc, Value crd,
468  TensorId tid, Level lvl);
469 
470  bool isSynTensor(TensorId tid) const { return tid == getSynTensorId(); }
471 
472  bool isOutputTensor(TensorId tid) const {
473  return hasOutput && tid == getOutTensorId();
474  }
475 
476  bool isSparseOutput(TensorId tid) const {
477  return isOutputTensor(tid) && isSparseOut;
478  }
479 
480  bool isValidLevel(TensorId tid, Level lvl) const {
481  return tid < lvlTypes.size() && lvl < lvlTypes[tid].size();
482  }
483 
484  /// Forwards the (conceptual) "tree iterator" when iterating over a fully
485  /// reduced slice created by index-reduction.
486  void forwardsReducedSliceLevelTreeIt(OpBuilder &builder, Location loc,
487  TensorId tid, Level lvl, Value fcnt);
488 
489  /// Prepares loop for iterating over `tensor[lvl]`, under the assumption
490  /// that `tensor[0...lvl-1]` loops have already been set up.
491  void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
492  TensorId tid, Level lvl);
493 
494  /// Enter dense tensor levels. Since the dense tensor condition could be
495  /// optimized from the loop condition, we need to compute the
496  /// positions/coordinates inside the loop body.
497  void enterTensorsAtDenseLvls(OpBuilder &builder, Location loc,
498  ArrayRef<TensorLvlCond> dnConds, Value iv,
499  SmallVectorImpl<SliceLoopInfo> &sliceInfo);
500 
501  /// Emits a for loop to iterate over a tensor level with the provided
502  /// lower bound `lo` and upper bound `hi`. Apart from iterating just
503  /// single tensor level, for loops can be used for slice-driven loop on
504  /// dense level too.
505  /// Returns a pair: the loop generated and the value for the induction
506  /// variable.
507  std::pair<Operation *, Value>
508  emitForLoopOverTensorAtLvl(OpBuilder &builder, Location loc, TensorId tid,
509  Level lvl, Value lo, Value hi,
510  MutableArrayRef<Value> reduc, bool isParallel);
511 
512  /// Emits a while loop to co-iterate over a list of sparse condition, or
513  /// (complex) single sparse condition that can not be handled by for loop
514  /// (e.g., index reduction loop).
515  /// Returns a pair: the loop generated and the value for the induction
516  /// variable (which is the minimum coordinate of all the tensor that being
517  /// iterated).
518  std::pair<Operation *, Value>
519  emitWhileLoopOverTensorsAtLvls(OpBuilder &builder, Location loc,
520  ArrayRef<TensorLvlCond> spConds,
521  MutableArrayRef<Value> reduc, bool needsUniv);
522 
523  /// Generates the while loop condition for the given tensor level condition.
524  Value genWhileLoopConditions(OpBuilder &builder, Location loc, ValueRange ivs,
525  TensorLvlCond cond);
526 
527  /// Generates the while loop body for the given tensor level condition.
528  std::optional<Value> genWhileLoopBody(OpBuilder &builder, Location loc,
529  ValueRange ivs, TensorLvlCond cond);
530 
531  /// Generates the values (to forward the loop) if the extra check failes.
532  /// E.g., to iterate over a sparse tensor slice, we need:
533  ///
534  /// pos = onSlice(curCrd) ? pos : pos + 1
535  ///
536  /// to skip invalid coordinate that is included in the slice.
537  ValueRange genCheckedValue(OpBuilder &builder, Location loc, Value pred,
538  ValueRange curArg, TensorLvlCond cond);
539 
540  /// Exits a for loop, returns the reduction results, e.g.,
541  /// For sequential for loops:
542  /// %ret = for () {
543  /// ...
544  /// %val = addi %args, %c
545  /// yield %val
546  /// }
547  /// For parallel loops, the following generated code by users:
548  /// %ret = parallel () init(%args) {
549  /// ...
550  /// %val = op %args, %c
551  /// }
552  /// will be transformed into
553  /// %ret = parallel () init(%args) {
554  /// ...
555  /// scf.reduce(%c) bb0(%0, %1){
556  /// %val = op %0, %1
557  /// scf.reduce.return %val
558  /// }
559  /// }
560  /// NOTE: only one instruction will be moved into reduce block,
561  /// transformation will fail if multiple instructions are used to compute
562  /// the reduction value. Return %ret to user, while %val is provided by
563  /// users (`reduc`).
564  void exitForLoop(RewriterBase &rewriter, Location loc,
565  MutableArrayRef<Value> reduc);
566 
567  /// Exits a while loop, returns the reduction results.
568  void exitWhileLoop(OpBuilder &builder, Location loc,
569  MutableArrayRef<Value> reduc);
570 
571  //
572  // Slice-driven loop related methods.
573  //
574 
575  /// Retrieves the most recent slice on lvl. To reduce affine expression like
576  /// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of
577  /// size d2). This methods returns the latter slice (of size d2).
578  const SliceInfo &getMostRecentSliceOnLvl(TensorId tid, Level lvl);
579 
580  /// Similar to getMostRecentSliceOnLvl, but yields error when the most recent
581  /// slice is not the final slice needed to fully reduced the dependencies.
582  const SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl) {
583  const SliceInfo &info = getMostRecentSliceOnLvl(tid, lvl);
584  assert(info.depth == dependentLvlMap[tid][lvl].size() - 1);
585  return info;
586  }
587 
588  /// Get the remaining number of constraints needed to fully *resolve*
589  /// dependent levels on tensor[tid].
590  unsigned remDepOnLevel(TensorId tid, Level lvl) const;
591 
592  /// Whether the tid, lvl is fully *reduced*, i.e., the non-trivial index
593  /// expression has been reduced to a trivial one.
594  /// E.g., A[i + j] => A[i + 2] (j is reduced)
595  bool depFullyReduced(TensorId tid, Level lvl) const {
596  return remDepOnLevel(tid, lvl) == 1;
597  }
598 
599  /// Whether the tid, lvl is fully resolved, i.e., we entered the level already
600  /// (the index on that level is determined).
601  /// E.g., A[i + j] => A[2 + 3] (both i and j become invariants for inner
602  /// loops).
603  bool lvlFullyResolved(TensorId tid, Level lvl) const {
604  return remDepOnLevel(tid, lvl) == 0;
605  }
606 
607  /// Generates a whileOp to iterate over a subset of coordinates on tid on lvl
608  /// using the pHi and pLo provided, the loop break on the first coordinate
609  /// that exceeds the slice boundary (i.e., coord >= slice.offset +
610  /// slice.size).
611  std::pair<Operation *, ValueRange>
612  genSliceLvlTraverseLoop(OpBuilder &builder, Location loc, Value pLo,
613  Value pHi, Value offset, Value size, TensorId tid,
614  Level lvl, ValueRange userReduc,
615  LoopBodyBuilder bodyBuilder);
616 
617  /// Generates a nested loop that iterates over tid on all the coordinates on
618  /// lvl.
619  ValueRange genUnResolvedSliceTreeTraverse(
620  OpBuilder &builder, Location loc, TensorId tid,
621  ArrayRef<const SliceInfo *> unResLvls,
622  std::optional<std::pair<TensorId, Level>> firstResLvl,
623  ValueRange userReduc, LoopBodyBuilder bodyBuilder);
624 
625  /// Generates code to get the first non-empty slice of tid on lvl, when all
626  /// the previous level before `lvl` are resolved (or lvl is the first level).
627  ///
628  /// This is the simple case because the previous level are resolved into a
629  /// single node in the storage tree.
630  void genResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
631  Level lvl);
632 
633  /// Generates code to get the first non-empty slice of tid on lvl, when
634  /// the previous levels before `lvl` are unresolved
635  ///
636  /// This is the complex case because the previous levels corresponding to a
637  /// range of nodes in the storage tree.
638  void genUnResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
639  Level lvl);
640 
641  /// Invalidates the index kept in slice postion buffers (by setting it to
642  /// zero).
643  /// TODO: We should instead use an SSA value for the index.
644  void invalidateSliceIterIdx(OpBuilder &builder, Location loc, TensorId tid,
645  Level lvl);
646  /// Generates code to get the first non-empty slice of tid on lvl.
647  /// return true if has already been resolved.
648  bool genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl);
649 
650  /// Generates code to get the next non-empty slices of tid on lvl.
651  /// Returns a tuple of values for <NonEmpty, MinCrd, AbsOffset> (see
652  /// SliceInfo) respectively.
653  std::tuple<Value, Value, Value> genSliceNextInduction(OpBuilder &builder,
654  Location loc,
655  TensorId tid,
656  Level lvl);
657 
658  /// A optional string attribute that should be attached to the loop
659  /// generated by loop emitter, it might help following passes to identify
660  /// loops that operates on sparse tensors more easily.
661  StringAttr loopTag;
662  /// Whether the loop emitter needs to treat the last tensor as the output
663  /// tensor.
664  bool hasOutput;
665  bool isSparseOut;
666 
667  /// The insertion point to allocate top level local variables.
668  Operation *localInsertPos;
669 
670  //
671  // Fields which have `numTensor` many entries.
672  //
673  // TODO: switch to an AOS style to avoid any possible mismatches.
674  //
675 
676  /// Input and (optional) output tensors.
677  std::vector<Value> tensors;
678  /// Level-types for each `(TensorId, Level)` pair.
679  std::vector<std::vector<LevelType>> lvlTypes;
680  // Sparse iteration information for each `(TensorId, Level)` pair.
681  // These arrays are updated to remain current within the current loop.
682  // TODO: Clarify which of these are indexed by dstLvl vs srcLvl.
683  //
684  /// The collection of positions for a given element (one such collection
685  /// for each tensor). This is the position analogue of the "coords"
686  /// naming convention.
687  ///
688  /// FIXME: [CLARIFY_POSITS_LVL] It's unclear which levels are used
689  /// to index the `posits` array. On the one hand `genSparseCrd`
690  /// uses dstLvl; on the other hand `enterLoopOverTensorAtLvl`,
691  /// `prepareLoopOverTensorAtLvl`, and `enterCoIterationOverTensorsAtLvls`
692  /// uses srcLvl. So which is it?
693  std::vector<std::vector<Value>> posits;
694  /// The collection of coordinates for a given element (one such
695  /// collection for each tensor).
696  std::vector<std::vector<Value>> coords;
697  // The segment upper bound for non-uniques level after de-duplication.
698  std::vector<std::vector<Value>> segHi;
699  std::vector<std::vector<Value>> highs;
700  std::vector<std::vector<Value>> lvlSizes;
701  std::vector<std::vector<Value>> positionsBuffers; // to_positions
702  std::vector<std::vector<Value>> coordinatesBuffers; // to_coordinates
703  std::vector<Value> valBuffer; // to_value
704 
705  //
706  // Slice-driven loops related fields.
707  //
708 
709  /// Whether the sparse input is a slice.
710  std::vector<bool> isSparseSlices;
711  /// Values related to slices.
712  std::vector<std::vector<Value>> sliceOffsets;
713  std::vector<std::vector<Value>> sliceStrides;
714 
715  // Map from [tid, level] to a list of dependent [tidlevel, coefficient].
716  // See comments for `DependentLvlGetter`.
717  std::vector<std::vector<std::vector<std::pair<TensorLevel, unsigned>>>>
718  dependentLvlMap;
719 
720  // The cached position buffer for the slices, they serve the same purpose as
721  // ptrBuffer for compressed dimensions.
722  // But they always starts with the first pidx pointing to coord > slice.offset
723  // to avoid iteration from the beginning.
724  std::vector<std::vector<std::vector<Value>>> slicePosBuffer;
725 
726  // The (size, stride) for each conceptual slice used for index reduction
727  // loops.
728  std::vector<std::vector<std::vector<std::pair<Value, unsigned>>>> sliceMeta;
729 
730  // The number of reduced dependencies on a tensor level so far.
731  std::vector<std::vector<unsigned>> levelReducedDep;
732 
733  // sliceStack[tid] holds the generated slice stack on tid.
734  std::vector<std::vector<SliceInfo>> sliceStack;
735 
736  /// TODO: not yet used, it should track the current level for each tensor
737  /// to help eliminate `lvls` paramters from above APIs.
738  /// std::vector<Level> curLvl;
739 
740  //
741  // Fields which have at most `numLoops` many entries.
742  //
743 
744  /// Loop Stack, stores the information of all the nested loops that are
745  /// alive.
746  std::vector<LoopInfo> loopStack;
747 
748  // Loop Sequence Stack, stores the unversial index for the current loop
749  // sequence. and a list of tids which was taken sliced.
750  // TODO: maybe we should have a LoopSeqInfo
751  std::vector<std::pair<Value, std::vector<std::tuple<TensorId, Level, bool>>>>
752  loopSeqStack;
753 };
754 
755 } // namespace sparse_tensor
756 } // namespace mlir
757 
758 #endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
Base type for affine expression.
Definition: AffineExpr.h:68
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:63
This class helps build Operations.
Definition: Builders.h:206
Operation is the basic unit of execution within MLIR.
Definition: Operation.h:88
This class coordinates the application of a rewrite on a set of IR, providing a way for clients to tr...
Definition: PatternMatch.h:399
This class provides an abstraction over the different types of ranges over Values.
Definition: ValueRange.h:378
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
Definition: Value.h:96
void exitCurrentLoop(RewriterBase &rewriter, Location loc, MutableArrayRef< Value > reduc={})
Generates code to exit the current loop (e.g., generates yields, forwards loop induction variables,...
constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName()
Definition: LoopEmitter.h:282
const std::vector< Value > & getValBuffer() const
Definition: LoopEmitter.h:280
void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef< TensorLevel > tidLvls)
Enters a new loop sequence, the loops within the same sequence starts from the break points of previo...
const std::vector< std::vector< Value > > & getCoordinateBuffers() const
Definition: LoopEmitter.h:277
void initialize(ValueRange tensors, StringAttr loopTag=nullptr, bool hasOutput=false, bool isSparseOut=false, unsigned numLoops=0, DependentLvlGetter getter=nullptr)
Takes an array of input tensors, which the generated loops will iterate over.
Value genAffine(OpBuilder &builder, Location loc, AffineExpr a)
Generates code to compute an affine expression whose variables are LoopIds (i.e., a....
TensorId getOutTensorId() const
Gets the TensorId for output tensor.
Definition: LoopEmitter.h:232
TensorLevel makeTensorLevel(TensorId t, Level l) const
Compresses a TensorId and Level into a TensorLevel.
Definition: LoopEmitter.h:238
unsigned getNumManifestTensors() const
Gets the total number of manifest tensors (excluding the synthetic tensor).
Definition: LoopEmitter.h:220
const std::vector< std::vector< Value > > & getPosits() const
Getters.
Definition: LoopEmitter.h:271
Operation * enterCoIterationOverTensorsAtLvls(OpBuilder &builder, Location loc, ArrayRef< TensorLevel > tidLvls, MutableArrayRef< Value > reduc={}, bool isParallel=false, bool genDedup=false, bool needsUniv=false)
Emits a co-iteration loop over a set of tensors.
const std::vector< std::vector< Value > > & getPositionBuffers() const
Definition: LoopEmitter.h:274
Operation * enterFilterLoopOverTensorAtLvl(OpBuilder &builder, Location loc, TensorId tid, Level lvl, AffineExpr affine, MutableArrayRef< Value > reduc={})
Enters a loop that tries to locate a coordinate in a sparse level based on the value evaluated by th...
std::pair< TensorId, Level > unpackTensorLevel(TensorLevel tidLvl) const
De-compresses a TensorLevel back to a pair of TensorId and Level.
Definition: LoopEmitter.h:243
auto unpackTensorLevelRange(ContainerTy &&c) const
Converts a range of TensorLevel to a range of std::pair<TensorId, Level>
Definition: LoopEmitter.h:250
unsigned getNumTensors() const
Gets the total number of tensors that loopEmitter is operating on.
Definition: LoopEmitter.h:223
SmallVector< Value > getLoopIVs() const
Fills the out-parameter with the loop induction variables for all loops in the current loop-stack.
Definition: LoopEmitter.h:199
Value getLoopIV(LoopOrd n) const
Gets loop induction variable for the given LoopOrd.
Definition: LoopEmitter.h:210
auto getLoopIVsRange() const
Get the range of values for all induction variables.
Definition: LoopEmitter.h:191
void initializeLoopEmit(OpBuilder &builder, Location loc, OutputUpdater updater=nullptr, SynTensorBoundSetter synSetter=nullptr)
Starts a loop emitting session by generating all the buffers needed for iterating over the tensors.
void genDenseAffineAddress(OpBuilder &builder, Location loc, TensorLevel tidLvl, AffineExpr lvlExpr)
Emits the address for a dense level based on the value evaluated by the provided affine expression.
auto unpackTensorLevelFromCondRange(ContainerTy &&c) const
Definition: LoopEmitter.h:260
void exitCurrentLoopSeq(OpBuilder &builder, Location loc)
Exits the current loop sequence, this will reset universal index to 0.
LoopOrd getCurrentDepth() const
Gets the current depth of the loop-stack.
Definition: LoopEmitter.h:205
TensorId getSynTensorId() const
Gets the TensorId for synthetic tensor.
Definition: LoopEmitter.h:229
const std::vector< std::vector< Value > > & getCoords() const
Definition: LoopEmitter.h:272
const std::vector< std::vector< Value > > & getHighs() const
Definition: LoopEmitter.h:273
unsigned TensorLevel
Definition: LoopEmitter.h:46
uint64_t Level
The type of level identifiers and level-ranks.
Definition: SparseTensor.h:38
unsigned LoopOrd
The position of a loop in the loop-stack, or the position of a LoopId in a topologically-sorted list ...
Definition: LoopEmitter.h:43
unsigned TensorId
Tensor identifiers, chosen to be the BlockArgument::getArgNumber of the value passed to Merger::build...
Definition: Merger.h:35
Include the generated interface declarations.