Sparsification.cpp
1 //===- Sparsification.cpp - Implementation of sparsification --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements converting sparse tensor types to actual sparse code.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "CodegenUtils.h"
14 
30 #include "mlir/IR/Matchers.h"
31 #include "mlir/IR/TensorEncoding.h"
32 #include "llvm/ADT/SmallBitVector.h"
33 
34 using namespace mlir;
35 using namespace mlir::sparse_tensor;
36 
37 //===----------------------------------------------------------------------===//
38 // Declarations of data structures.
39 //===----------------------------------------------------------------------===//
40 
41 namespace {
42 
43 // Iteration graph sorting.
44 enum SortMask {
45  kSparseOnly = 0x0,
46  kIncludeDense = 0x1,
47  kIncludeUndef = 0x2,
48  kIncludeAll = 0x3
49 };
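// The masks compose bitwise: requesting both dense and undefined constraints
// is the same as kIncludeAll, so callers test individual bits via `mask & ...`.
// A minimal compile-time check of that relationship (illustrative only):
static_assert((kIncludeDense | kIncludeUndef) == kIncludeAll,
              "sort masks are expected to compose bitwise");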
50 
51 // Reduction kinds.
52 enum Reduction { kNoReduc, kSum, kProduct, kAnd, kOr, kXor };
53 
54 // Code generation.
55 struct CodeGen {
56  CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops,
57  OpOperand *op, unsigned nest)
58  : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),
59  pointers(numTensors, std::vector<Value>(numLoops)),
60  indices(numTensors, std::vector<Value>(numLoops)),
61  highs(numTensors, std::vector<Value>(numLoops)),
62  pidxs(numTensors, std::vector<Value>(numLoops)),
63  idxs(numTensors, std::vector<Value>(numLoops)), redVal(), sparseOut(op),
64  outerParNest(nest), lexIdx(), lexVal(), expValues(), expFilled(),
65  expAdded(), expCount(), curVecMask() {}
66  /// Sparsification options.
67  SparsificationOptions options;
68  /// Universal dense indices and upper bounds (by index). The loops array
69  /// is updated with the value of the universal dense index in the current
70  /// loop. The sizes array is set once with the inferred dimension sizes.
71  std::vector<Value> loops;
72  std::vector<Value> sizes;
73  /// Buffers for storing dense and sparse numerical values (by tensor).
74  /// This array is set once during bufferization of all tensors.
75  std::vector<Value> buffers;
76  /// Sparse storage schemes (1-D): pointers and indices (by tensor and index).
77  /// This array is set once during bufferization of all sparse tensors.
78  std::vector<std::vector<Value>> pointers;
79  std::vector<std::vector<Value>> indices;
80  /// Sparse iteration information (by tensor and index). These arrays
81  /// are updated to remain current within the current loop.
82  std::vector<std::vector<Value>> highs;
83  std::vector<std::vector<Value>> pidxs;
84  std::vector<std::vector<Value>> idxs;
85  /// Current reduction, updated during code generation. When indices of a
86  /// reduction are exhausted, all inner loops can use a scalarized reduction.
87  unsigned redExp = -1u;
88  Value redVal;
89  Reduction redKind = kNoReduc;
90  // Sparse tensor as output. Implemented either through direct injective
91  // insertion in lexicographic index order (where indices are updated
92  // in the temporary array `lexIdx`) or through access pattern expansion
93  // in the innermost loop nest (`expValues` through `expCount`).
94  OpOperand *sparseOut;
95  unsigned outerParNest;
96  Value lexIdx;
97  Value lexVal;
98  Value expValues;
99  Value expFilled;
100  Value expAdded;
101  Value expCount;
102  // Current vector length and mask.
103  unsigned curVecLength = 1;
104  Value curVecMask;
105 };
106 
107 } // namespace
108 
109 //===----------------------------------------------------------------------===//
110 // Sparse compiler analysis methods.
111 //===----------------------------------------------------------------------===//
112 
113 /// Helper method to construct a permuted dimension ordering
114 /// that adheres to the given topological sort.
115 static AffineMap permute(MLIRContext *context, AffineMap m,
116  std::vector<unsigned> &topSort) {
117  unsigned sz = topSort.size();
118  assert(m.getNumResults() == sz && "TopoSort/AffineMap size mismatch");
119  // Construct the inverse of `m` to avoid the asymptotic complexity
120  // of calling `m.getPermutedPosition` repeatedly.
121  SmallVector<unsigned, 4> inv(sz);
122  for (unsigned i = 0; i < sz; i++)
123  inv[i] = m.getDimPosition(i);
124  // Construct the permutation.
125  SmallVector<unsigned, 4> perm(sz);
126  for (unsigned i = 0; i < sz; i++)
127  perm[i] = inv[topSort[i]];
128  return AffineMap::getPermutationMap(perm, context);
129 }
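// Worked example: for m = (d0, d1, d2) -> (d2, d0, d1) the table above is
// inv = [2, 0, 1]; a topological sort topSort = [1, 0, 2] then yields
// perm = [inv[1], inv[0], inv[2]] = [0, 2, 1] as the permuted dimension order.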
130 
131 /// Helper method to apply dimension ordering permutation.
132 static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
133  if (enc) {
134  auto order = enc.getDimOrdering();
135  if (order) {
136  assert(order.isPermutation());
137  return order.getDimPosition(d);
138  }
139  }
140  return d;
141 }
142 
143 /// Helper method to translate dim level type to internal representation.
144 static Dim toDim(const SparseTensorEncodingAttr &enc, unsigned d) {
145  if (enc) {
146  SparseTensorEncodingAttr::DimLevelType tp = enc.getDimLevelType()[d];
147  if (tp == SparseTensorEncodingAttr::DimLevelType::Compressed)
148  return Dim::kSparse;
149  if (tp == SparseTensorEncodingAttr::DimLevelType::Singleton)
150  return Dim::kSingle;
151  }
152  return Dim::kDense;
153 }
154 
155 /// Helper method to inspect affine expressions. Rejects cases where the
156 /// same index is used more than once. Also rejects affine expressions
157 /// that are not a direct index for annotated tensors.
158 // TODO: accept more affine cases for sparse tensors
159 static bool findAffine(Merger &merger, unsigned tensor, AffineExpr a, Dim dim,
160  bool isDense) {
161  switch (a.getKind()) {
162  case AffineExprKind::DimId: {
163  unsigned idx = a.cast<AffineDimExpr>().getPosition();
164  if (!merger.isDim(tensor, idx, Dim::kUndef))
165  return false; // used more than once
166  merger.setDim(tensor, idx, dim);
167  return true;
168  }
169  case AffineExprKind::Add:
170  case AffineExprKind::Mul: {
171  if (!isDense)
172  return false;
173  auto binOp = a.cast<AffineBinaryOpExpr>();
174  return findAffine(merger, tensor, binOp.getLHS(), dim, isDense) &&
175  findAffine(merger, tensor, binOp.getRHS(), dim, isDense);
176  }
177  case AffineExprKind::Constant:
178  return isDense;
179  default:
180  return false;
181  }
182 }
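// For example, a subscript A(i, j) sets both dimensions and is accepted for
// any tensor, A(i, i) is rejected because index i is used twice, and a
// compound subscript such as A(i + j, k) is accepted only for dense tensors
// (isDense recurses into both operands of the addition).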
183 
184 /// Helper method to inspect sparse encodings in the tensor types.
185 /// Fills the per-dimension sparsity information for all tensors.
186 /// Returns true if the sparse annotations and affine subscript
187 /// expressions of all tensors are admissible. Returns false if
188 /// no annotations are found or inadmissible constructs occur.
189 static bool findSparseAnnotations(Merger &merger, linalg::GenericOp op) {
190  bool annotated = false;
191  for (OpOperand *t : op.getInputAndOutputOperands()) {
192  auto map = op.getTiedIndexingMap(t);
193  auto enc = getSparseTensorEncoding(t->get().getType());
194  if (enc)
195  annotated = true;
196  assert(map.getNumResults() == op.getRank(t));
197  for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
198  unsigned tensor = t->getOperandNumber();
199  AffineExpr a = map.getResult(perm(enc, d));
200  if (!findAffine(merger, tensor, a, toDim(enc, d), !enc))
201  return false; // inadmissible affine expression
202  }
203  }
204  return annotated;
205 }
206 
207 /// A DFS helper to compute a topological sort. Note that recursion is
208 /// bounded by the number of implicit loops, which is always small.
209 /// Returns false when a cycle is detected.
210 static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
211  std::vector<unsigned> &topSort,
212  std::vector<std::vector<bool>> &adjM) {
213  if (visit[i] != 0)
214  return visit[i] != 1; // 1 denotes cycle!
215  visit[i] = 1;
216  for (unsigned j = 0, e = visit.size(); j < e; j++)
217  if (adjM[i][j])
218  if (!topSortDFS(j, visit, topSort, adjM))
219  return false;
220  visit[i] = 2;
221  topSort.push_back(i);
222  return true;
223 }
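// The visit array encodes three states per node: 0 = unvisited, 1 = still on
// the DFS stack, 2 = finished. Reaching a node in state 1 again means a back
// edge, i.e. a cycle, so the sort fails; reaching a node in state 2 is
// harmless. Nodes are pushed in post-order and reversed by the caller to
// obtain the final topological order.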
224 
225 /// Helper method to add all constraints from the indices in one affine
226 /// expression before all indices in the other affine expression. For
227 /// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
228 static void addAffineOrderings(std::vector<std::vector<bool>> &adjM,
229  AffineExpr a, AffineExpr b, unsigned fidx) {
230  switch (a.getKind()) {
231  case AffineExprKind::DimId: {
232  unsigned idx = a.cast<AffineDimExpr>().getPosition();
233  if (b)
234  addAffineOrderings(adjM, b, AffineExpr(), idx);
235  else
236  adjM[fidx][idx] = true;
237  break;
238  }
239  case AffineExprKind::Add:
240  case AffineExprKind::Mul: {
241  auto binOp = a.cast<AffineBinaryOpExpr>();
242  addAffineOrderings(adjM, binOp.getLHS(), b, fidx);
243  addAffineOrderings(adjM, binOp.getRHS(), b, fidx);
244  break;
245  }
246  default:
247  break;
248  }
249 }
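// In the example above, the initial call sees a = i0 + i1 and b = i2 + i3 + 1;
// every DimId reached in `a` becomes a new `fidx`, after which the recursion
// walks `b` and sets adjM[fidx][idx] for every DimId found there, producing
// the four orderings i0<i2, i0<i3, i1<i2, and i1<i3 (the constant is ignored).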
250 
251 /// Computes a topologically sorted iteration graph for the linalg operation.
252 /// Ensures all tensors are visited in natural index order. This is essential
253 /// for sparse storage formats since these only support access along fixed
254 /// dimensions. Even for dense storage formats, however, the natural index
255 /// order yields innermost unit-stride access with better spatial locality.
256 static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
257  std::vector<unsigned> &topSort, unsigned mask,
258  OpOperand *skip = nullptr) {
259  // Set up an n x n from/to adjacency matrix of the iteration graph
260  // for the implicit loop indices i_0 .. i_n-1.
261  unsigned n = op.getNumLoops();
262  std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
263 
264  // Iterate over the indexing maps of every tensor in the tensor expression.
265  for (OpOperand *t : op.getInputAndOutputOperands()) {
266  // Skip tensor during cycle resolution.
267  if (t == skip)
268  continue;
269  // Get map and encoding.
270  auto map = op.getTiedIndexingMap(t);
271  auto enc = getSparseTensorEncoding(t->get().getType());
272  assert(map.getNumDims() == n);
273  // Skip dense tensor constraints when not requested.
274  if (!(mask & SortMask::kIncludeDense) && !enc)
275  continue;
276  // Each tensor expression and optional dimension ordering (row-major
277  // by default) puts an ordering constraint on the loop indices. For
278  // example, the tensor expression A_ijk forces the ordering i < j < k
279  // on the loop indices if no explicit dimension ordering is given.
280  for (unsigned d = 1, rank = map.getNumResults(); d < rank; d++) {
281  AffineExpr f = map.getResult(perm(enc, d - 1));
282  AffineExpr t = map.getResult(perm(enc, d));
283  addAffineOrderings(adjM, f, t, 0);
284  }
285  // Push unrelated loops into sparse iteration space, so these
286  // will be skipped more often.
287  if (mask & SortMask::kIncludeUndef) {
288  unsigned tensor = t->getOperandNumber();
289  for (unsigned i = 0; i < n; i++)
290  if (merger.isDim(tensor, i, Dim::kSparse))
291  for (unsigned j = 0; j < n; j++)
292  if (merger.isDim(tensor, j, Dim::kUndef))
293  adjM[i][j] = true;
294  }
295  }
296 
297  // Topologically sort the iteration graph to determine loop order.
298  // Report failure for a cyclic iteration graph.
299  topSort.clear();
300  topSort.reserve(n);
301  std::vector<unsigned> visit(n, 0);
302  for (unsigned i = 0; i < n; i++)
303  if (visit[i] == 0)
304  if (!topSortDFS(i, visit, topSort, adjM))
305  return false; // cycle!
306  std::reverse(std::begin(topSort), std::end(topSort));
307  return true;
308 }
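// For example, for SpMV x(i) += A(i, j) * b(j) with a sparse matrix A in the
// default row-major dimension order, the subscript A(i, j) contributes the
// single constraint i < j, the graph is acyclic, and the resulting loop order
// (i, j) visits rows before columns, as required by the storage scheme.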
309 
310 /// Returns true if tensor has an in-place annotation.
311 static bool isInPlace(Value val) {
312  if (auto arg = val.dyn_cast<BlockArgument>())
313  if (auto funcOp = dyn_cast<func::FuncOp>(arg.getOwner()->getParentOp()))
314  if (auto attr = funcOp.getArgAttrOfType<BoolAttr>(
315  arg.getArgNumber(),
316  bufferization::BufferizableOpInterface::kInplaceableAttrName))
317  return attr.getValue();
318  return false;
319 }
320 
321 /// Returns true if tensor materializes uninitialized into the computation.
322 static bool isMaterializing(Value val) {
323  return val.getDefiningOp<linalg::InitTensorOp>() ||
324  val.getDefiningOp<bufferization::AllocTensorOp>();
325 }
326 
327 /// Returns true when the tensor expression is admissible for codegen.
328 /// Since all sparse input tensors are admissible, we just need to check
329 /// whether the output tensor in the tensor expression codegen is admissible.
330 /// Sets `sparseOut` to the tensor and `outerParNest` to the outer injective
331 /// nesting depth when a "truly dynamic" sparse tensor output occurs.
332 static bool isAdmissableTensorExp(Merger &merger, linalg::GenericOp op,
333  std::vector<unsigned> &topSort, unsigned exp,
334  OpOperand **sparseOut,
335  unsigned &outerParNest) {
336  OpOperand *lhs = op.getOutputOperand(0);
337  unsigned tensor = lhs->getOperandNumber();
338  auto enc = getSparseTensorEncoding(lhs->get().getType());
339  // A non-annotated output tensor is assumed dense, and becomes a random
340  // access n-dim memref. Admissible since insertions cannot occur.
341  if (!enc)
342  return true;
343  // An all-dense annotated "sparse" output tensor becomes a linearized random
344  // access 1-dim memref. Also admissible since insertions cannot occur.
345  bool allDense = true;
346  auto iteratorTypes = op.iterator_types().getValue();
347  unsigned numLoops = iteratorTypes.size();
348  for (unsigned i = 0; i < numLoops; i++)
349  if (merger.isDim(tensor, i, Dim::kSparse)) {
350  allDense = false;
351  break;
352  }
353  if (allDense)
354  return true;
355  // A tensor expression with a sparse output tensor that changes its values
356  // but not its nonzero structure, an operation called "simply dynamic" in
357  // [Bik96,Ch9], is also admissible without special codegen, provided
358  // the tensor's underlying sparse storage scheme can be modified in-place.
359  if (merger.isSingleCondition(tensor, exp) && isInPlace(lhs->get()))
360  return true;
361  // Accept "truly dynamic" if the output tensor materializes uninitialized
362  // into the computation and insertions occur in lexicographic index order.
363  if (isMaterializing(lhs->get())) {
364  unsigned nest = 0;
365  for (unsigned i = 0; i < numLoops; i++) {
366  if (isReductionIterator(iteratorTypes[topSort[i]]))
367  break; // terminate at first reduction
368  nest++;
369  }
370  // Determine admissible dynamic insertion situations:
371  // (1) fully injective, since there are no reductions,
372  // (2) admissible 1-d expansion in innermost dimension.
373  if (nest >= op.getRank(lhs) - 1) {
374  *sparseOut = lhs;
375  outerParNest = nest;
376  return true;
377  }
378  }
379  return false;
380 }
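// For example, scaling the stored values of an in-place sparse output, as in
// x(i) = x(i) * 2, leaves the nonzero structure intact and is accepted as
// "simply dynamic", whereas a kernel whose sparse output materializes
// uninitialized is accepted as "truly dynamic" only when the outer parallel
// nest reaches at least all but the innermost output dimension before the
// first reduction is encountered.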
381 
382 //===----------------------------------------------------------------------===//
383 // Sparse compiler synthesis methods (reductions).
384 //===----------------------------------------------------------------------===//
385 
386 /// Maps reduction kind to vector::CombiningKind.
387 static vector::CombiningKind getCombiningKind(Reduction kind) {
388  switch (kind) {
389  case kNoReduc:
390  break;
391  case kSum:
392  return vector::CombiningKind::ADD;
393  case kProduct:
394  return vector::CombiningKind::MUL;
395  case kAnd:
396  return vector::CombiningKind::AND;
397  case kOr:
398  return vector::CombiningKind::OR;
399  case kXor:
400  return vector::CombiningKind::XOR;
401  }
402  llvm_unreachable("unknown reduction kind");
403 }
404 
405 /// Maps operation to reduction.
406 static Reduction getReduction(Kind kind) {
407  switch (kind) {
408  case Kind::kAddF:
409  case Kind::kAddC:
410  case Kind::kAddI:
411  case Kind::kSubF:
412  case Kind::kSubC:
413  case Kind::kSubI:
414  return kSum;
415  case Kind::kMulF:
416  case Kind::kMulC:
417  case Kind::kMulI:
418  return kProduct;
419  case Kind::kAndI:
420  return kAnd;
421  case Kind::kOrI:
422  return kOr;
423  case Kind::kXorI:
424  return kXor;
425  default:
426  llvm_unreachable("unexpected reduction operator");
427  }
428 }
429 
430 /// Generates an initial value for a vector reduction, following the scheme
431 /// given in Chapter 5 of "The Software Vectorization Handbook", where the
432 /// initial scalar value is correctly embedded in the vector reduction value,
433 /// and a straightforward horizontal reduction will complete the operation.
434 static Value genVectorReducInit(CodeGen &codegen, OpBuilder &builder,
435  Location loc, VectorType vtp) {
436  Value r = codegen.redVal;
437  switch (codegen.redKind) {
438  case kNoReduc:
439  break;
440  case kSum:
441  case kXor:
442  // Initialize reduction vector to: | 0 | .. | 0 | r |
443  return builder.create<vector::InsertElementOp>(
444  loc, r, constantZero(builder, loc, vtp),
445  constantIndex(builder, loc, 0));
446  case kProduct:
447  // Initialize reduction vector to: | 1 | .. | 1 | r |
448  return builder.create<vector::InsertElementOp>(
449  loc, r, constantOne(builder, loc, vtp), constantIndex(builder, loc, 0));
450  case kAnd:
451  case kOr:
452  // Initialize reduction vector to: | r | .. | r | r |
453  return builder.create<vector::BroadcastOp>(loc, vtp, r);
454  }
455  llvm_unreachable("unknown reduction kind");
456 }
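// For example, a 4-way sum reduction with incoming scalar value r starts from
// the vector | 0 | 0 | 0 | r |, so that after the vector loop a single
// horizontal add (see genVectorReducEnd) folds the partial sums and the
// original scalar back into one result.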
457 
458 /// Generates final value for a vector reduction.
459 static Value genVectorReducEnd(CodeGen &codegen, OpBuilder &builder,
460  Location loc, VectorType vtp) {
461  vector::CombiningKind kind = getCombiningKind(codegen.redKind);
462  return builder.create<vector::ReductionOp>(loc, kind, codegen.redVal);
463 }
464 
465 /// Updates scalarized reduction value.
466 static void updateReduc(Merger &merger, CodeGen &codegen, Value reduc) {
467  assert(codegen.redKind != kNoReduc);
468  codegen.redVal = merger.exp(codegen.redExp).val = reduc;
469 }
470 
471 //===----------------------------------------------------------------------===//
472 // Sparse compiler synthesis methods (statements and expressions).
473 //===----------------------------------------------------------------------===//
474 
475 /// Generates buffer for the output tensor. Note that all sparse kernels
476 /// assume that when all elements are written to (viz. x(i) = y(i) * z(i)),
477 /// the output buffer is already initialized to all zeroes and only nonzero
478 /// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)),
479 /// only nonzero values are used for the updates and no assumption on the
480 /// original contents of the output buffer is necessary.
481 static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder,
482  linalg::GenericOp op, MemRefType denseTp,
483  ArrayRef<Value> args) {
484  Location loc = op.getLoc();
485  OpOperand *lhs = op.getOutputOperand(0);
486  Value tensor = lhs->get();
487  bool isInit = op.isInitTensor(lhs);
488  // An output tensor that is in-place can simply materialize from the buffer
489  // of the tensor that appears in the outs() clause. For updates, this has
490  // the advantage that only the nonzero values are involved in the computation,
491  // keeping the operation O(nnz). In all other cases, we are forced to zero
492  // out the buffer to enforce the assumption above, which may negatively
493  // impact running complexity (viz. O(n^2 + nnz) vs. O(nnz) for matrices).
494  // TODO: use better analysis to avoid zeroing out the buffer?
495  if (isInPlace(tensor)) {
496  Value init =
497  builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
498  if (!isInit) {
499  Value zero = constantZero(builder, loc, denseTp.getElementType());
500  builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{init});
501  }
502  return init;
503  }
504  // By default, a new buffer is allocated which is either set to zero (when
505  // no updates occur or the tensor materializes into this computation) or
506  // initialized to the value of the tensor defined in the outs() clause.
507  // This is always correct (since it enforces all assumptions above) but
508  // may negatively impact running complexity as explained above.
509  Value alloc = builder.create<memref::AllocOp>(loc, denseTp, args);
510  if (!isInit || isMaterializing(tensor)) {
511  Value zero = constantZero(builder, loc, denseTp.getElementType());
512  builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{alloc});
513  } else {
514  Value init =
515  builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
516  builder.create<memref::CopyOp>(loc, init, alloc);
517  }
518  return alloc;
519 }
520 
521 /// Local bufferization of all dense and sparse data structures.
522 /// This code enables testing the first prototype sparse compiler.
523 // TODO: replace this with a proliferated bufferization strategy
524 static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
525  linalg::GenericOp op) {
526  Location loc = op.getLoc();
527  assert(op.getNumInputsAndOutputs() == op.getNumInputs() + 1);
528  // For every tensor, find lower and upper bound on dimensions, set the
529  // same bounds on loop indices, and obtain dense or sparse buffer(s).
530  SmallVector<Value, 4> args;
531  for (OpOperand *t : op.getInputAndOutputOperands()) {
532  unsigned tensor = t->getOperandNumber();
533  auto shape = op.getShape(t);
534  auto map = op.getTiedIndexingMap(t);
535  auto enc = getSparseTensorEncoding(t->get().getType());
536  // Scan all dimensions of current tensor.
537  args.clear();
538  for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
539  AffineExpr a = map.getResult(perm(enc, d));
540  if (a.getKind() != AffineExprKind::DimId)
541  continue; // compound
542  unsigned idx = a.cast<AffineDimExpr>().getPosition();
543  // Handle sparse storage schemes.
544  if (merger.isDim(tensor, idx, Dim::kSparse)) {
545  auto dynShape = {ShapedType::kDynamicSize};
546  auto ptrTp =
547  MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
548  auto indTp =
549  MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
550  Value dim = constantIndex(builder, loc, d);
551  // Generate sparse primitives to obtain pointers and indices.
552  codegen.pointers[tensor][idx] =
553  builder.create<ToPointersOp>(loc, ptrTp, t->get(), dim);
554  codegen.indices[tensor][idx] =
555  builder.create<ToIndicesOp>(loc, indTp, t->get(), dim);
556  }
557  // Find upper bound in current dimension.
558  unsigned p = perm(enc, d);
559  Value up = linalg::createOrFoldDimOp(builder, loc, t->get(), p);
560  if (ShapedType::isDynamic(shape[p]))
561  args.push_back(up);
562  assert(codegen.highs[tensor][idx] == nullptr);
563  codegen.sizes[idx] = codegen.highs[tensor][idx] = up;
564  }
565  // Perform the required bufferization. Dense inputs materialize
566  // from the input tensors. Dense outputs need special handling.
567  // Sparse inputs use sparse primitives to obtain the values.
568  // We also accept in-place all-dense annotated "sparse" outputs.
569  Type elementType = getElementTypeOrSelf(t->get().getType());
570  if (!enc) {
571  // Non-annotated dense tensors.
572  auto denseTp = MemRefType::get(shape, elementType);
573  if (tensor < op.getNumInputs())
574  codegen.buffers[tensor] =
575  builder.create<bufferization::ToMemrefOp>(loc, denseTp, t->get());
576  else
577  codegen.buffers[tensor] =
578  genOutputBuffer(codegen, builder, op, denseTp, args);
579  } else if (t == codegen.sparseOut) {
580  // True sparse output needs a lexIdx array.
581  Value rank = constantIndex(builder, loc, op.getRank(t));
582  auto dynShape = {ShapedType::kDynamicSize};
583  auto memTp = MemRefType::get(dynShape, builder.getIndexType());
584  codegen.lexIdx = builder.create<memref::AllocaOp>(loc, memTp, rank);
585  codegen.lexVal = builder.create<memref::AllocaOp>(
586  loc, MemRefType::get({}, elementType));
587  } else {
588  // Annotated sparse tensors.
589  auto dynShape = {ShapedType::kDynamicSize};
590  auto sparseTp = MemRefType::get(dynShape, elementType);
591  codegen.buffers[tensor] =
592  builder.create<ToValuesOp>(loc, sparseTp, t->get());
593  }
594  }
595 }
596 
597 /// Constructs vector type.
598 static VectorType vectorType(CodeGen &codegen, Type etp) {
599  unsigned numScalableDims = codegen.options.enableVLAVectorization;
600  return VectorType::get(codegen.curVecLength, etp, numScalableDims);
601 }
602 
603 /// Constructs vector type from pointer.
604 static VectorType vectorType(CodeGen &codegen, Value ptr) {
605  return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
606 }
607 
608 /// Constructs vector iteration mask.
609 static Value genVectorMask(CodeGen &codegen, OpBuilder &builder, Value iv,
610  Value lo, Value hi, Value step) {
611  Location loc = iv.getLoc();
612  VectorType mtp = vectorType(codegen, builder.getI1Type());
613  // Special case if the vector length evenly divides the trip count (for
614  // example, "for i = 0, 128, 16"). A constant all-true mask is generated
615  // so that all subsequent masked memory operations are immediately folded
616  // into unconditional memory operations.
617  IntegerAttr loInt, hiInt, stepInt;
618  if (matchPattern(lo, m_Constant(&loInt)) &&
619  matchPattern(hi, m_Constant(&hiInt)) &&
620  matchPattern(step, m_Constant(&stepInt))) {
621  if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)
622  return builder.create<vector::BroadcastOp>(
623  loc, mtp, constantI1(builder, loc, true));
624  }
625  // Otherwise, generate a vector mask that avoids overrunning the upper bound
626  // during vector execution. Here we rely on subsequent loop optimizations to
627  // avoid executing the mask in all iterations, for example, by splitting the
628  // loop into an unconditional vector loop and a scalar cleanup loop.
629  auto minMap = AffineMap::get(
630  /*dimCount=*/2, /*symbolCount=*/1,
631  {builder.getAffineSymbolExpr(0),
632  builder.getAffineDimExpr(0) - builder.getAffineDimExpr(1)},
633  builder.getContext());
634  Value end =
635  builder.createOrFold<AffineMinOp>(loc, minMap, ValueRange{hi, iv, step});
636  return builder.create<vector::CreateMaskOp>(loc, mtp, end);
637 }
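// For example, with lo = 0, hi = 128, step = 16 the trip count divides evenly
// and a constant all-true mask is emitted; with hi = 100 the affine min yields
// end = min(step, hi - iv), so the final iteration (iv = 96) gets a mask with
// only the first 4 lanes enabled.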
638 
639 /// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi].
640 static Value genVectorLoad(CodeGen &codegen, OpBuilder &builder, Value ptr,
641  ArrayRef<Value> args) {
642  Location loc = ptr.getLoc();
643  VectorType vtp = vectorType(codegen, ptr);
644  Value pass = constantZero(builder, loc, vtp);
645  if (args.back().getType().isa<VectorType>()) {
646  SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
647  Value indexVec = args.back();
648  scalarArgs.back() = constantIndex(builder, loc, 0);
649  return builder.create<vector::GatherOp>(loc, vtp, ptr, scalarArgs, indexVec,
650  codegen.curVecMask, pass);
651  }
652  return builder.create<vector::MaskedLoadOp>(loc, vtp, ptr, args,
653  codegen.curVecMask, pass);
654 }
655 
656 /// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs.
657 static void genVectorStore(CodeGen &codegen, OpBuilder &builder, Value rhs,
658  Value ptr, ArrayRef<Value> args) {
659  Location loc = ptr.getLoc();
660  if (args.back().getType().isa<VectorType>()) {
661  SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
662  Value indexVec = args.back();
663  scalarArgs.back() = constantIndex(builder, loc, 0);
664  builder.create<vector::ScatterOp>(loc, ptr, scalarArgs, indexVec,
665  codegen.curVecMask, rhs);
666  return;
667  }
668  builder.create<vector::MaskedStoreOp>(loc, ptr, args, codegen.curVecMask,
669  rhs);
670 }
671 
672 /// Generates a vectorized invariant. Here we rely on subsequent loop
673 /// optimizations to hoist the invariant broadcast out of the vector loop.
674 static Value genVectorInvariantValue(CodeGen &codegen, OpBuilder &builder,
675  Value val) {
676  VectorType vtp = vectorType(codegen, val.getType());
677  return builder.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
678 }
679 
680 /// Generates an affine expression.
681 //
682 // TODO: generalize for sparse tensor subscripts
683 //
684 static Value genAffine(CodeGen &codegen, OpBuilder &builder, AffineExpr a,
685  Location loc) {
686  switch (a.getKind()) {
687  case AffineExprKind::DimId: {
688  unsigned idx = a.cast<AffineDimExpr>().getPosition();
689  return codegen.loops[idx]; // universal dense index
690  }
691  case AffineExprKind::Add: {
692  auto binOp = a.cast<AffineBinaryOpExpr>();
693  return builder.create<arith::AddIOp>(
694  loc, genAffine(codegen, builder, binOp.getLHS(), loc),
695  genAffine(codegen, builder, binOp.getRHS(), loc));
696  }
697  case AffineExprKind::Mul: {
698  auto binOp = a.cast<AffineBinaryOpExpr>();
699  return builder.create<arith::MulIOp>(
700  loc, genAffine(codegen, builder, binOp.getLHS(), loc),
701  genAffine(codegen, builder, binOp.getRHS(), loc));
702  }
703  case AffineExprKind::Constant: {
704  int64_t c = a.cast<AffineConstantExpr>().getValue();
705  return constantIndex(builder, loc, c);
706  }
707  default:
708  llvm_unreachable("unexpected affine subscript");
709  }
710 }
711 
712 /// Generates index for load/store on sparse tensor.
713 static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) {
714  auto map = op.getTiedIndexingMap(t);
715  auto enc = getSparseTensorEncoding(t->get().getType());
716  AffineExpr a = map.getResult(perm(enc, map.getNumResults() - 1));
717  assert(a.getKind() == AffineExprKind::DimId);
718  unsigned idx = a.cast<AffineDimExpr>().getPosition();
719  return codegen.loops[idx];
720 }
721 
722 /// Generates subscript for load/store on a dense or sparse tensor.
723 static Value genSubscript(CodeGen &codegen, OpBuilder &builder,
724  linalg::GenericOp op, OpOperand *t,
725  SmallVector<Value, 4> &args) {
726  unsigned tensor = t->getOperandNumber();
727  auto map = op.getTiedIndexingMap(t);
728  auto enc = getSparseTensorEncoding(t->get().getType());
729  unsigned rank = map.getNumResults();
730  if (enc) {
731  // Note that currently, all sparse subscripts are simple.
732  // TODO: accept affine too?
733  AffineExpr a = map.getResult(perm(enc, rank - 1));
734  assert(a.getKind() == AffineExprKind::DimId);
735  unsigned idx = a.cast<AffineDimExpr>().getPosition();
736  assert(codegen.pidxs[tensor][idx] != nullptr);
737  args.push_back(codegen.pidxs[tensor][idx]); // position index
738  } else {
739  for (unsigned d = 0; d < rank; d++) {
740  AffineExpr a = map.getResult(perm(enc, d));
741  args.push_back(genAffine(codegen, builder, a, op.getLoc()));
742  }
743  }
744  return codegen.buffers[tensor];
745 }
746 
747 /// Generates insertion code to implement dynamic tensor load.
748 static Value genInsertionLoad(CodeGen &codegen, OpBuilder &builder,
749  linalg::GenericOp op, OpOperand *t) {
750  Location loc = op.getLoc();
751  // Direct lexicographic index order, tensor loads as zero.
752  if (!codegen.expValues) {
753  Type tp = getElementTypeOrSelf(t->get().getType());
754  return constantZero(builder, loc, tp);
755  }
756  // Load from expanded access pattern.
757  Value index = genIndex(codegen, op, t);
758  return builder.create<memref::LoadOp>(loc, codegen.expValues, index);
759 }
760 
761 /// Generates insertion code to implement dynamic tensor store.
762 static void genInsertionStore(CodeGen &codegen, OpBuilder &builder,
763  linalg::GenericOp op, OpOperand *t, Value rhs) {
764  Location loc = op.getLoc();
765  // Direct insertion in lexicographic index order.
766  if (!codegen.expValues) {
767  builder.create<memref::StoreOp>(loc, rhs, codegen.lexVal);
768  builder.create<LexInsertOp>(loc, t->get(), codegen.lexIdx, codegen.lexVal);
769  return;
770  }
771  // Generates insertion code along expanded access pattern.
772  // if (!expFilled[i]) then
773  // expFilled[i] = true
774  // expAdded[inserts++] = i
775  // endif
776  // values[i] = rhs
777  Value index = genIndex(codegen, op, t);
778  Value fval = constantI1(builder, loc, false);
779  Value tval = constantI1(builder, loc, true);
780  // If statement.
781  Value filled = builder.create<memref::LoadOp>(loc, codegen.expFilled, index);
782  Value cond = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
783  filled, fval);
784  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, builder.getIndexType(), cond,
785  /*else=*/true);
786  // True branch.
787  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
788  builder.create<memref::StoreOp>(loc, tval, codegen.expFilled, index);
789  builder.create<memref::StoreOp>(loc, index, codegen.expAdded,
790  codegen.expCount);
791  Value one = constantIndex(builder, loc, 1);
792  Value add = builder.create<arith::AddIOp>(loc, codegen.expCount, one);
793  builder.create<scf::YieldOp>(loc, add);
794  // False branch.
795  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
796  builder.create<scf::YieldOp>(loc, codegen.expCount);
797  builder.setInsertionPointAfter(ifOp);
798  // Value assignment.
799  codegen.expCount = ifOp.getResult(0);
800  builder.create<memref::StoreOp>(loc, rhs, codegen.expValues, index);
801 }
802 
803 /// Generates a load on a dense or sparse tensor.
804 static Value genTensorLoad(Merger &merger, CodeGen &codegen, OpBuilder &builder,
805  linalg::GenericOp op, unsigned exp) {
806  // Test if the load was hoisted to a higher loop nest.
807  Value val = merger.exp(exp).val;
808  if (val) {
809  if (codegen.curVecLength > 1 && !val.getType().isa<VectorType>())
810  return genVectorInvariantValue(codegen, builder, val);
811  return val;
812  }
813  // Load during insertion.
814  OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
815  if (t == codegen.sparseOut)
816  return genInsertionLoad(codegen, builder, op, t);
817  // Actual load.
818  SmallVector<Value, 4> args;
819  Value ptr = genSubscript(codegen, builder, op, t, args);
820  if (codegen.curVecLength > 1)
821  return genVectorLoad(codegen, builder, ptr, args);
822  return builder.create<memref::LoadOp>(op.getLoc(), ptr, args);
823 }
824 
825 /// Generates a store on a dense or sparse tensor.
826 static void genTensorStore(Merger &merger, CodeGen &codegen, OpBuilder &builder,
827  linalg::GenericOp op, unsigned exp, Value rhs) {
828  Location loc = op.getLoc();
829  // Test if this is a scalarized reduction.
830  if (codegen.redVal) {
831  if (codegen.curVecLength > 1)
832  rhs = builder.create<arith::SelectOp>(loc, codegen.curVecMask, rhs,
833  codegen.redVal);
834  updateReduc(merger, codegen, rhs);
835  return;
836  }
837  // Store during insertion.
838  OpOperand *t = op.getOutputOperand(0);
839  if (t == codegen.sparseOut) {
840  if (!rhs) {
841  // Only unary and binary are allowed to return uninitialized rhs
842  // to indicate missing output.
843  assert(merger.exp(exp).kind == kUnary || merger.exp(exp).kind == kBinary);
844  } else {
845  genInsertionStore(codegen, builder, op, t, rhs);
846  }
847  return;
848  }
849  // Actual store.
850  SmallVector<Value, 4> args;
851  Value ptr = genSubscript(codegen, builder, op, t, args);
852  if (codegen.curVecLength > 1)
853  genVectorStore(codegen, builder, rhs, ptr, args);
854  else
855  builder.create<memref::StoreOp>(loc, rhs, ptr, args);
856 }
857 
858 /// Generates a pointer/index load from the sparse storage scheme. Narrower
859 /// data types need to be zero extended before casting the value into the
860 /// index type used for looping and indexing.
861 static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc,
862  Value ptr, Value s) {
863  // See https://llvm.org/docs/GetElementPtr.html for some background on
864  // the complications described below.
865  if (codegen.curVecLength > 1) {
866  // Since the index vector is used in subsequent gather/scatter operations,
867  // which effectively define an unsigned pointer + signed index, we must
868  // zero extend the vector to an index width. For 8-bit and 16-bit values,
869  // a 32-bit index width suffices. For 32-bit values, zero extending the
870  // elements into 64-bit loses some performance since the 32-bit indexed
871  // gather/scatter is more efficient than the 64-bit index variant (if the
872  // negative 32-bit index space is unused, the enableSIMDIndex32 flag can
873  // preserve this performance). For 64-bit values, there is no good way
874  // to state that the indices are unsigned, which creates the potential for
875  // incorrect address calculations in the unlikely case we need such
876  // extremely large offsets.
877  Type etp = ptr.getType().cast<MemRefType>().getElementType();
878  Value vload = genVectorLoad(codegen, builder, ptr, {s});
879  if (!etp.isa<IndexType>()) {
880  if (etp.getIntOrFloatBitWidth() < 32)
881  vload = builder.create<arith::ExtUIOp>(
882  loc, vectorType(codegen, builder.getI32Type()), vload);
883  else if (etp.getIntOrFloatBitWidth() < 64 &&
884  !codegen.options.enableSIMDIndex32)
885  vload = builder.create<arith::ExtUIOp>(
886  loc, vectorType(codegen, builder.getI64Type()), vload);
887  }
888  return vload;
889  }
890  // For the scalar case, we simply zero extend narrower indices into 64-bit
891  // values before casting to index without a performance penalty. Here too,
892  // however, indices that already are 64-bit, in theory, cannot express the
893  // full range as explained above.
894  Value load = builder.create<memref::LoadOp>(loc, ptr, s);
895  if (!load.getType().isa<IndexType>()) {
896  if (load.getType().getIntOrFloatBitWidth() < 64)
897  load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
898  load =
899  builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
900  }
901  return load;
902 }
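// For example, an 8-bit pointer/index memref read under vector length 16 is
// loaded as vector<16xi8> and zero extended to vector<16xi32>, keeping the
// subsequent gather/scatter in the cheaper 32-bit indexed form unless 64-bit
// indices are explicitly required.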
903 
904 /// Generates an invariant value.
905 static Value genInvariantValue(Merger &merger, CodeGen &codegen,
906  OpBuilder &builder, unsigned exp) {
907  Value val = merger.exp(exp).val;
908  if (codegen.curVecLength > 1)
909  return genVectorInvariantValue(codegen, builder, val);
910  return val;
911 }
912 
913 /// Generates an address computation "sz * p + i".
914 static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc,
915  Value size, Value p, Value i) {
916  Value mul = builder.create<arith::MulIOp>(loc, size, p);
917  if (auto vtp = i.getType().dyn_cast<VectorType>()) {
918  Value inv =
919  builder.create<arith::IndexCastOp>(loc, vtp.getElementType(), mul);
920  mul = genVectorInvariantValue(codegen, builder, inv);
921  }
922  return builder.create<arith::AddIOp>(loc, mul, i);
923 }
924 
925 /// Generates an index value.
926 static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx,
927  unsigned ldx) {
928  Value ival = codegen.loops[idx];
929  Type itype = ival.getType();
930  // During vectorization, we either encounter:
931  // (1) indices already in vector form, as in ... = ind[lo:hi], good to go, or
932  // (2) single index, as in ... = i, must convert to [i, i+1, ...] for inner i.
933  unsigned vl = codegen.curVecLength;
934  if (vl > 1 && !itype.isa<VectorType>()) {
935  Location loc = ival.getLoc();
936  VectorType vtp = vectorType(codegen, itype);
937  ival = builder.create<vector::BroadcastOp>(loc, vtp, ival);
938  if (idx == ldx) {
939  Value incr;
940  if (vtp.isScalable()) {
941  Type stepvty = vectorType(codegen, builder.getI64Type());
942  Value stepv = builder.create<LLVM::StepVectorOp>(loc, stepvty);
943  incr = builder.create<arith::IndexCastOp>(loc, vtp, stepv);
944  } else {
945  SmallVector<APInt, 4> integers;
946  for (unsigned i = 0; i < vl; i++)
947  integers.push_back(APInt(/*width=*/64, i));
948  auto values = DenseElementsAttr::get(vtp, integers);
949  incr = builder.create<arith::ConstantOp>(loc, vtp, values);
950  }
951  ival = builder.create<arith::AddIOp>(loc, ival, incr);
952  }
953  }
954  return ival;
955 }
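// For example, with vl = 4 and the innermost index i being vectorized, the
// scalar i is broadcast to | i | i | i | i | and then incremented by the
// constant step vector | 0 | 1 | 2 | 3 |, yielding | i | i+1 | i+2 | i+3 |.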
956 
957 /// Semi-ring branches are simply inlined by the sparse compiler. Prior
958 /// analysis has verified that all computations are "local" to the inlined
959 /// branch or otherwise invariantly defined outside the loop nest, with the
960 /// exception of index computations, which need to be relinked to actual
961 /// inlined cloned code.
962 static Value relinkBranch(CodeGen &codegen, RewriterBase &rewriter,
963  Block *block, Value e, unsigned ldx) {
964  if (Operation *def = e.getDefiningOp()) {
965  if (auto indexOp = dyn_cast<linalg::IndexOp>(def))
966  return genIndexValue(codegen, rewriter, indexOp.dim(), ldx);
967  if (def->getBlock() == block) {
968  for (unsigned i = 0, n = def->getNumOperands(); i < n; i++)
969  def->setOperand(
970  i, relinkBranch(codegen, rewriter, block, def->getOperand(i), ldx));
971  }
972  }
973  return e;
974 }
975 
976 /// Recursively generates tensor expression.
977 static Value genExp(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
978  linalg::GenericOp op, unsigned exp, unsigned ldx) {
979  Location loc = op.getLoc();
980  if (exp == -1u)
981  return Value();
982  if (merger.exp(exp).kind == Kind::kTensor)
983  return genTensorLoad(merger, codegen, rewriter, op, exp);
984  if (merger.exp(exp).kind == Kind::kInvariant)
985  return genInvariantValue(merger, codegen, rewriter, exp);
986  if (merger.exp(exp).kind == Kind::kIndex)
987  return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx);
988  Value v0 =
989  genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e0, ldx);
990  Value v1 =
991  genExp(merger, codegen, rewriter, op, merger.exp(exp).children.e1, ldx);
992  Value ee = merger.buildExp(rewriter, loc, exp, v0, v1);
993  if (ee && (merger.exp(exp).kind == Kind::kUnary ||
994  merger.exp(exp).kind == Kind::kBinary ||
995  merger.exp(exp).kind == Kind::kBinaryBranch))
996  ee = relinkBranch(codegen, rewriter, ee.getParentBlock(), ee, ldx);
997  return ee;
998 }
999 
1000 /// Determines if affine expression is invariant.
1001 static bool isInvariantAffine(const CodeGen &codegen, AffineExpr a,
1002  unsigned ldx, bool &atLevel) {
1003  switch (a.getKind()) {
1004  case AffineExprKind::DimId: {
1005  unsigned idx = a.cast<AffineDimExpr>().getPosition();
1006  if (idx == ldx)
1007  atLevel = true;
1008  return codegen.loops[idx] != nullptr; // no longer in play?
1009  }
1010  case AffineExprKind::Add:
1011  case AffineExprKind::Mul: {
1012  auto binOp = a.cast<AffineBinaryOpExpr>();
1013  return isInvariantAffine(codegen, binOp.getLHS(), ldx, atLevel) &&
1014  isInvariantAffine(codegen, binOp.getRHS(), ldx, atLevel);
1015  }
1016  default:
1017  return true;
1018  }
1019 }
1020 
1021 /// Hoists loop invariant tensor loads for which indices have been exhausted.
1022 static void genInvariants(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1023  linalg::GenericOp op, unsigned exp, unsigned ldx,
1024  bool atStart, Kind last = Kind::kTensor) {
1025  if (exp == -1u)
1026  return;
1027  if (merger.exp(exp).kind == Kind::kTensor) {
1028  // Inspect tensor indices.
1029  bool atLevel = ldx == -1u;
1030  OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
1031  auto map = op.getTiedIndexingMap(t);
1032  auto enc = getSparseTensorEncoding(t->get().getType());
1033  for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
1034  AffineExpr a = map.getResult(perm(enc, d));
1035  if (!isInvariantAffine(codegen, a, ldx, atLevel))
1036  return; // still in play
1037  }
1038  // All exhausted at this level (atLevel denotes exactly at this level).
1039  if (!atLevel)
1040  return;
1041  OpOperand *lhs = op.getOutputOperand(0);
1042  if (lhs == t) {
1043  // Start or end a scalarized reduction
1044  if (atStart) {
1045  Value load = genTensorLoad(merger, codegen, builder, op, exp);
1046  codegen.redKind = getReduction(last);
1047  codegen.redExp = exp;
1048  updateReduc(merger, codegen, load);
1049  } else {
1050  Value redVal = codegen.redVal;
1051  updateReduc(merger, codegen, Value());
1052  codegen.redExp = -1u;
1053  codegen.redKind = kNoReduc;
1054  genTensorStore(merger, codegen, builder, op, exp, redVal);
1055  }
1056  } else {
1057  // Start or end loop invariant hoisting of a tensor load.
1058  merger.exp(exp).val =
1059  atStart ? genTensorLoad(merger, codegen, builder, op, exp) : Value();
1060  }
1061  } else if (merger.exp(exp).kind != Kind::kInvariant &&
1062  merger.exp(exp).kind != Kind::kIndex) {
1063  // Traverse into the binary operations. Note that we only hoist
1064  // tensor loads, since subsequent MLIR/LLVM passes know how to
1065  // deal with all other kinds of derived loop invariants.
1066  Kind last = merger.exp(exp).kind;
1067  unsigned e0 = merger.exp(exp).children.e0;
1068  unsigned e1 = merger.exp(exp).children.e1;
1069  genInvariants(merger, codegen, builder, op, e0, ldx, atStart, last);
1070  genInvariants(merger, codegen, builder, op, e1, ldx, atStart, last);
1071  }
1072 }
1073 
1074 /// Generates an expanded access pattern in innermost dimension.
1075 static void genExpansion(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1076  linalg::GenericOp op, unsigned at, bool atStart) {
1077  OpOperand *lhs = codegen.sparseOut;
1078  if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 ||
1079  at != codegen.outerParNest)
1080  return; // not needed at this level
1081  // Generate start or end of an expanded access pattern.
1082  Value tensor = lhs->get();
1083  Location loc = op.getLoc();
1084  if (atStart) {
1085  auto dynShape = {ShapedType::kDynamicSize};
1086  Type etp = tensor.getType().cast<ShapedType>().getElementType();
1087  Type t1 = MemRefType::get(dynShape, etp);
1088  Type t2 = MemRefType::get(dynShape, builder.getI1Type());
1089  Type t3 = MemRefType::get(dynShape, builder.getIndexType());
1090  Type t4 = builder.getIndexType();
1091  auto res =
1092  builder.create<ExpandOp>(loc, TypeRange({t1, t2, t3, t4}), tensor);
1093  assert(res.getNumResults() == 4);
1094  assert(!codegen.expValues);
1095  codegen.expValues = res.getResult(0);
1096  codegen.expFilled = res.getResult(1);
1097  codegen.expAdded = res.getResult(2);
1098  codegen.expCount = res.getResult(3);
1099  } else {
1100  assert(codegen.expValues);
1101  builder.create<CompressOp>(loc, tensor, codegen.lexIdx, codegen.expValues,
1102  codegen.expFilled, codegen.expAdded,
1103  codegen.expCount);
1104  codegen.expValues = codegen.expFilled = codegen.expAdded =
1105  codegen.expCount = Value();
1106  }
1107 }
1108 
1109 /// Generates initialization code for the subsequent loop sequence at
1110 /// current index level. Returns true if the loop sequence needs to
1111 /// maintain the universal index.
1112 static bool genInit(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1113  linalg::GenericOp op, std::vector<unsigned> &topSort,
1114  unsigned at, BitVector &inits) {
1115  bool needsUniv = false;
1116  Location loc = op.getLoc();
1117  unsigned idx = topSort[at];
1118 
1119  // Initialize sparse positions.
1120  for (unsigned b = 0, be = inits.size(); b < be; b++) {
1121  if (inits[b]) {
1122  unsigned tensor = merger.tensor(b);
1123  assert(idx == merger.index(b));
1124  if (merger.isDim(b, Dim::kSparse)) {
1125  // Initialize sparse index.
1126  unsigned pat = at;
1127  for (; pat != 0; pat--) {
1128  if (codegen.pidxs[tensor][topSort[pat - 1]])
1129  break;
1130  }
1131  Value ptr = codegen.pointers[tensor][idx];
1132  Value one = constantIndex(builder, loc, 1);
1133  Value p0 = (pat == 0) ? constantIndex(builder, loc, 0)
1134  : codegen.pidxs[tensor][topSort[pat - 1]];
1135  codegen.pidxs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p0);
1136  Value p1 = builder.create<arith::AddIOp>(loc, p0, one);
1137  codegen.highs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p1);
1138  } else {
1139  // Dense index still in play.
1140  needsUniv = true;
1141  }
1142  }
1143  }
1144 
1145  // Initialize the universal dense index.
1146  codegen.loops[idx] = constantIndex(builder, loc, 0);
1147  return needsUniv;
1148 }
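// For example, for a CSR matrix (dense rows, compressed columns) with column
// pointers = [0, 2, 3, 5], the loop sequence over the columns of row i starts
// at pointers[i] and ends at pointers[i + 1], so row 1 iterates positions
// [2, 3) of the indices and values arrays.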
1149 
1150 /// Determines whether the loop is vectorized. Any implicit inner loop in the
1151 /// Linalg operation is a candidate. Whether it is actually converted to SIMD
1152 /// code depends on the requested strategy.
1153 static bool isVectorFor(CodeGen &codegen, bool isInner, bool isReduction,
1154  bool isSparse) {
1155  // Reject vectorization of sparse output, unless innermost is reduction.
1156  if (codegen.sparseOut && !isReduction)
1157  return false;
1158  // Inspect strategy.
1159  switch (codegen.options.vectorizationStrategy) {
1160  case SparseVectorizationStrategy::kNone:
1161  return false;
1162  case SparseVectorizationStrategy::kDenseInnerLoop:
1163  return isInner && !isSparse;
1164  case SparseVectorizationStrategy::kAnyStorageInnerLoop:
1165  return isInner;
1166  }
1167  llvm_unreachable("unexpected vectorization strategy");
1168 }
1169 
1170 /// Determines whether the loop is parallelized. Any implicit loop in the
1171 /// Linalg operation that is marked "parallel" is a candidate. Whether it is
1172 /// actually converted to a parallel operation depends on the requested strategy.
1173 static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
1174  bool isSparse, bool isVector) {
1175  // Reject parallelization of sparse output.
1176  if (codegen.sparseOut)
1177  return false;
1178  // Inspect strategy.
1179  switch (codegen.options.parallelizationStrategy) {
1180  case SparseParallelizationStrategy::kNone:
1181  return false;
1182  case SparseParallelizationStrategy::kDenseOuterLoop:
1183  return isOuter && !isSparse && !isReduction && !isVector;
1184  case SparseParallelizationStrategy::kAnyStorageOuterLoop:
1185  return isOuter && !isReduction && !isVector;
1186  case SparseParallelizationStrategy::kDenseAnyLoop:
1187  return !isSparse && !isReduction && !isVector;
1188  case SparseParallelizationStrategy::kAnyStorageAnyLoop:
1189  return !isReduction && !isVector;
1190  }
1191  llvm_unreachable("unexpected parallelization strategy");
1192 }
1193 
1194 /// Checks unit stride for dense tensors. The iteration graph may have ignored
1195 /// dense access patterns in order to avoid cycles (sparse access patterns are
1196 /// always placed innermost), but that means dense access has become strided.
1197 /// This prevents effective vectorization.
1198 static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
1199  unsigned idx) {
1200  for (OpOperand *t : op.getInputAndOutputOperands()) {
1201  if (!getSparseTensorEncoding(t->get().getType())) {
1202  auto map = op.getTiedIndexingMap(t);
1203  for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
1204  AffineExpr a = map.getResult(d);
1205  // Report non-unit stride if innermost index appears at an outer
1206  // dimension (true non-unit stride) or if the innermost index appears
1207  // in a compound subscript in the innermost dimension. Even if the
1208  // latter is unit stride, it does not play well with scatter/gather.
1209  // TODO: accept unit stride affine innermost like a[i,j+k+1]?
1210  if (a.isFunctionOfDim(idx) &&
1211  ((d != rank - 1) || (a.getKind() != AffineExprKind::DimId)))
1212  return false;
1213  }
1214  }
1215  }
1216  return true;
1217 }
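// For example, with innermost loop index j, a dense access a(i, j) is unit
// stride and stays vectorizable, whereas a(j, i) (j appearing in an outer
// dimension) or a compound innermost subscript such as a(i, j + k) is
// rejected, since the latter does not combine well with gather/scatter.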
1218 
1219 /// Generates a for-loop on a single index.
1220 static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1221  linalg::GenericOp op, bool isOuter, bool isInner,
1222  unsigned idx, BitVector &indices) {
1223  unsigned fb = indices.find_first();
1224  unsigned tensor = merger.tensor(fb);
1225  assert(idx == merger.index(fb));
1226  auto iteratorTypes = op.iterator_types().getValue();
1227  bool isReduction = isReductionIterator(iteratorTypes[idx]);
1228  bool isSparse = merger.isDim(fb, Dim::kSparse);
1229  bool isVector = isVectorFor(codegen, isInner, isReduction, isSparse) &&
1230  denseUnitStrides(merger, op, idx);
1231  bool isParallel =
1232  isParallelFor(codegen, isOuter, isReduction, isSparse, isVector);
1233 
1234  // Prepare vector length.
1235  if (isVector)
1236  codegen.curVecLength = codegen.options.vectorLength;
1237 
1238  // Loop bounds and increment.
1239  Location loc = op.getLoc();
1240  Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx];
1241  Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx];
1242  Value step = constantIndex(builder, loc, codegen.curVecLength);
1243  if (isVector && codegen.options.enableVLAVectorization) {
1244  Value vscale = builder.create<vector::VectorScaleOp>(
1245  loc, IndexType::get(builder.getContext()));
1246  step = builder.create<arith::MulIOp>(loc, vscale, step);
1247  }
1248 
1249  // Emit a parallel loop.
1250  if (isParallel) {
1251  assert(!isVector);
1252  scf::ParallelOp parOp = builder.create<scf::ParallelOp>(loc, lo, hi, step);
1253  if (isSparse)
1254  codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
1255  else
1256  codegen.loops[idx] = parOp.getInductionVars()[0];
1257  builder.setInsertionPointToStart(parOp.getBody());
1258  return parOp;
1259  }
1260 
1261  // Emit a sequential or vector loop.
1262  SmallVector<Value, 4> operands;
1263  if (codegen.redVal) {
1264  // In a vector loop, bring reduction into SIMD form, if not already.
1265  if (isVector && !codegen.redVal.getType().isa<VectorType>()) {
1266  VectorType vtp = vectorType(codegen, codegen.redVal.getType());
1267  Value vred = genVectorReducInit(codegen, builder, loc, vtp);
1268  updateReduc(merger, codegen, vred);
1269  }
1270  operands.push_back(codegen.redVal);
1271  }
1272  if (codegen.expValues)
1273  operands.push_back(codegen.expCount);
1274  scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, operands);
1275  if (codegen.redVal)
1276  updateReduc(merger, codegen, forOp.getRegionIterArgs().front());
1277  if (codegen.expValues)
1278  codegen.expCount = forOp.getRegionIterArgs().back();
1279  // Assign induction variable to sparse or dense index.
1280  Value iv = forOp.getInductionVar();
1281  if (isSparse)
1282  codegen.pidxs[tensor][idx] = iv;
1283  else
1284  codegen.loops[idx] = iv;
1285  builder.setInsertionPointToStart(forOp.getBody());
1286  // Share vector iteration mask between all subsequent loads/stores.
1287  if (isVector)
1288  codegen.curVecMask = genVectorMask(codegen, builder, iv, lo, hi, step);
1289  return forOp;
1290 }
1291 
1292 /// Emits a while-loop for co-iteration over multiple indices.
1293 static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1294  linalg::GenericOp op, unsigned idx, bool needsUniv,
1295  BitVector &indices) {
1296  SmallVector<Type, 4> types;
1297  SmallVector<Value, 4> operands;
1298  // Construct the while-loop with a parameter for each index.
1299  Type indexType = builder.getIndexType();
1300  for (unsigned b = 0, be = indices.size(); b < be; b++) {
1301  if (indices[b] && merger.isDim(b, Dim::kSparse)) {
1302  unsigned tensor = merger.tensor(b);
1303  assert(idx == merger.index(b));
1304  types.push_back(indexType);
1305  operands.push_back(codegen.pidxs[tensor][idx]);
1306  }
1307  }
1308  if (codegen.redVal) {
1309  types.push_back(codegen.redVal.getType());
1310  operands.push_back(codegen.redVal);
1311  }
1312  if (codegen.expValues) {
1313  types.push_back(indexType);
1314  operands.push_back(codegen.expCount);
1315  }
1316  if (needsUniv) {
1317  types.push_back(indexType);
1318  operands.push_back(codegen.loops[idx]);
1319  }
1320  assert(types.size() == operands.size());
1321  Location loc = op.getLoc();
1322  scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
1323 
1324  SmallVector<Location> locs(types.size(), loc);
1325  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
1326  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
1327 
1328  // Build the "before" region, which effectively consists
1329  // of a conjunction of "i < upper" tests on all induction variables.
1330  builder.setInsertionPointToStart(&whileOp.getBefore().front());
1331  Value cond;
1332  unsigned o = 0;
1333  for (unsigned b = 0, be = indices.size(); b < be; b++) {
1334  if (indices[b] && merger.isDim(b, Dim::kSparse)) {
1335  unsigned tensor = merger.tensor(b);
1336  assert(idx == merger.index(b));
1337  Value op1 = before->getArgument(o);
1338  Value op2 = codegen.highs[tensor][idx];
1339  Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
1340  op1, op2);
1341  cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
1342  codegen.pidxs[tensor][idx] = after->getArgument(o++);
1343  }
1344  }
1345  if (codegen.redVal)
1346  updateReduc(merger, codegen, after->getArgument(o++));
1347  if (codegen.expValues)
1348  codegen.expCount = after->getArgument(o++);
1349  if (needsUniv)
1350  codegen.loops[idx] = after->getArgument(o++);
1351  assert(o == operands.size());
1352  builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
1353  builder.setInsertionPointToStart(&whileOp.getAfter().front());
1354  return whileOp;
1355 }
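// For example, when co-iterating two sparse operands over index i, the before
// region computes cond = (p_a < hi_a) && (p_b < hi_b) on the two position
// arguments, and the after region re-binds those positions (plus the
// scalarized reduction, expansion count, and universal index, when present)
// as its block arguments.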
1356 
1357 /// Generates a for-loop or a while-loop, depending on whether it implements
1358 /// singleton iteration or co-iteration over the given conjunction.
1359 static Operation *genLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1360  linalg::GenericOp op, std::vector<unsigned> &topSort,
1361  unsigned at, bool needsUniv, BitVector &indices) {
1362  unsigned idx = topSort[at];
1363  if (indices.count() == 1) {
1364  bool isOuter = at == 0;
1365  bool isInner = at == topSort.size() - 1;
1366  return genFor(merger, codegen, builder, op, isOuter, isInner, idx, indices);
1367  }
1368  return genWhile(merger, codegen, builder, op, idx, needsUniv, indices);
1369 }
1370 
1371 /// Generates the local variables for this loop, consisting of the sparse
1372 /// indices, restored universal dense index, and dense positions.
1373 static void genLocals(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1374  linalg::GenericOp op, std::vector<unsigned> &topSort,
1375  unsigned at, bool needsUniv, BitVector &locals) {
1376  Location loc = op.getLoc();
1377  unsigned idx = topSort[at];
1378 
1379  // Initialize sparse indices.
1380  Value min;
1381  for (unsigned b = 0, be = locals.size(); b < be; b++) {
1382  if (locals[b] && merger.isDim(b, Dim::kSparse)) {
1383  unsigned tensor = merger.tensor(b);
1384  assert(idx == merger.index(b));
1385  Value ptr = codegen.indices[tensor][idx];
1386  Value s = codegen.pidxs[tensor][idx];
1387  Value load = genLoad(codegen, builder, loc, ptr, s);
1388  codegen.idxs[tensor][idx] = load;
1389  if (!needsUniv) {
1390  if (min) {
1391  Value cmp = builder.create<arith::CmpIOp>(
1392  loc, arith::CmpIPredicate::ult, load, min);
1393  min = builder.create<arith::SelectOp>(loc, cmp, load, min);
1394  } else {
1395  min = load;
1396  }
1397  }
1398  }
1399  }
1400 
1401  // Merge dense universal index over minimum.
1402  if (min) {
1403  assert(!needsUniv);
1404  codegen.loops[idx] = min;
1405  }
1406 
1407  // Initialize dense positions. Note that we generate dense indices of the
1408  // output tensor unconditionally, since they may not appear in the lattice,
1409  // but may be needed for linearized codegen.
1410  for (unsigned b = 0, be = locals.size(); b < be; b++) {
1411  if ((locals[b] || merger.isOutTensor(b, idx)) &&
1412  merger.isDim(b, Dim::kDense)) {
1413  unsigned tensor = merger.tensor(b);
1414  assert(idx == merger.index(b));
1415  unsigned pat = at;
1416  for (; pat != 0; pat--)
1417  if (codegen.pidxs[tensor][topSort[pat - 1]])
1418  break;
1419  Value p = (pat == 0) ? constantIndex(builder, loc, 0)
1420  : codegen.pidxs[tensor][topSort[pat - 1]];
1421  codegen.pidxs[tensor][idx] = genAddress(
1422  codegen, builder, loc, codegen.sizes[idx], p, codegen.loops[idx]);
1423  }
1424  }
1425 
1426  // Move the insertion indices in lexicographic index order. During access
1427  // pattern expansion, we can skip setting the innermost dimension.
1428  if (codegen.sparseOut && !codegen.expValues) {
1429  Value pos = constantIndex(builder, loc, at);
1430  builder.create<memref::StoreOp>(loc, codegen.loops[idx], codegen.lexIdx,
1431  pos);
1432  }
1433 }
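// For each sparse tensor participating in the loop, the index initialization
// above amounts to a load from the indices array followed by a running
// minimum (sketch; %indices_a, %indices_b, %pa, %pb are placeholders):
//
//   %ia  = memref.load %indices_a[%pa] : memref<?xindex>
//   %ib  = memref.load %indices_b[%pb] : memref<?xindex>
//   %cmp = arith.cmpi ult, %ib, %ia : index
//   %min = arith.select %cmp, %ib, %ia : index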
1434 
1435 /// Generates the induction structure for a while-loop.
1436 static void genWhileInduction(Merger &merger, CodeGen &codegen,
1437  OpBuilder &builder, linalg::GenericOp op,
1438  unsigned idx, bool needsUniv,
1439  BitVector &induction, scf::WhileOp whileOp) {
1440  Location loc = op.getLoc();
1441  // Finalize each else branch of all if statements.
1442  if (codegen.redVal || codegen.expValues) {
1443  while (auto ifOp = dyn_cast_or_null<scf::IfOp>(
1444  builder.getInsertionBlock()->getParentOp())) {
1445  unsigned y = 0;
1446  SmallVector<Value, 4> yields;
1447  if (codegen.redVal) {
1448  yields.push_back(codegen.redVal);
1449  updateReduc(merger, codegen, ifOp.getResult(y++));
1450  }
1451  if (codegen.expValues) {
1452  yields.push_back(codegen.expCount);
1453  codegen.expCount = ifOp->getResult(y++);
1454  }
1455  assert(y == yields.size());
1456  builder.create<scf::YieldOp>(loc, yields);
1457  builder.setInsertionPointAfter(ifOp);
1458  }
1459  }
1460  builder.setInsertionPointToEnd(&whileOp.getAfter().front());
1461  // Finalize the induction. Note that the induction could be performed
1462  // in the individual if-branches to avoid re-evaluating the conditions.
1463  // However, that would result in a rather elaborate forest of yield
1464  // instructions during code generation. Moreover, performing the induction
1465  // after the if-statements more closely resembles code generated by TACO.
1466  unsigned o = 0;
1467  SmallVector<Value, 4> operands;
1468  Value one = constantIndex(builder, loc, 1);
1469  for (unsigned b = 0, be = induction.size(); b < be; b++) {
1470  if (induction[b] && merger.isDim(b, Dim::kSparse)) {
1471  unsigned tensor = merger.tensor(b);
1472  assert(idx == merger.index(b));
1473  Value op1 = codegen.idxs[tensor][idx];
1474  Value op2 = codegen.loops[idx];
1475  Value op3 = codegen.pidxs[tensor][idx];
1476  Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
1477  op1, op2);
1478  Value add = builder.create<arith::AddIOp>(loc, op3, one);
1479  operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
1480  codegen.pidxs[tensor][idx] = whileOp->getResult(o++);
1481  }
1482  }
1483  if (codegen.redVal) {
1484  operands.push_back(codegen.redVal);
1485  updateReduc(merger, codegen, whileOp->getResult(o++));
1486  }
1487  if (codegen.expValues) {
1488  operands.push_back(codegen.expCount);
1489  codegen.expCount = whileOp->getResult(o++);
1490  }
1491  if (needsUniv) {
1492  operands.push_back(
1493  builder.create<arith::AddIOp>(loc, codegen.loops[idx], one));
1494  codegen.loops[idx] = whileOp->getResult(o++);
1495  }
1496  assert(o == operands.size());
1497  builder.create<scf::YieldOp>(loc, operands);
1498  builder.setInsertionPointAfter(whileOp);
1499 }
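// The induction computed above advances a sparse position only when its index
// matched the loop index in this iteration, using a compare/select pair
// (sketch with placeholder names %ia, %i, %pa):
//
//   %hit  = arith.cmpi eq, %ia, %i : index
//   %pinc = arith.addi %pa, %c1 : index
//   %pnew = arith.select %hit, %pinc, %pa : index
//   ...
//   scf.yield %pnew, ... : index, ...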
1500 
1501 /// Generates the induction structure for a for-loop.
1502 static void genForInduction(Merger &merger, CodeGen &codegen,
1503  OpBuilder &builder, linalg::GenericOp op,
1504  Operation *loop) {
1505  Location loc = op.getLoc();
1506  unsigned o = 0;
1507  SmallVector<Value, 4> operands;
1508  if (codegen.redVal) {
1509  operands.push_back(codegen.redVal);
1510  updateReduc(merger, codegen, loop->getResult(o++));
1511  }
1512  if (codegen.expValues) {
1513  operands.push_back(codegen.expCount);
1514  codegen.expCount = loop->getResult(o++);
1515  }
1516  assert(o == operands.size());
1517  if (o > 0)
1518  builder.create<scf::YieldOp>(loc, operands);
1519  builder.setInsertionPointAfter(loop);
1520 }
1521 
1522 /// Generates a single if-statement within a while-loop.
1523 static scf::IfOp genIf(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1524  linalg::GenericOp op, unsigned idx,
1525  BitVector &conditions) {
1526  Location loc = op.getLoc();
1527  SmallVector<Type, 4> types;
1528  Value cond;
1529  for (unsigned b = 0, be = conditions.size(); b < be; b++) {
1530  if (conditions[b]) {
1531  unsigned tensor = merger.tensor(b);
1532  assert(idx == merger.index(b));
1533  Value clause;
1534  if (merger.isDim(b, Dim::kSparse)) {
1535  Value op1 = codegen.idxs[tensor][idx];
1536  Value op2 = codegen.loops[idx];
1537  clause = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
1538  op1, op2);
1539  } else {
1540  clause = constantI1(builder, loc, true);
1541  }
1542  cond = cond ? builder.create<arith::AndIOp>(loc, cond, clause) : clause;
1543  }
1544  }
1545  if (codegen.redVal)
1546  types.push_back(codegen.redVal.getType());
1547  if (codegen.expValues)
1548  types.push_back(builder.getIndexType());
1549  scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, cond, /*else=*/true);
1550  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
1551  return ifOp;
1552 }
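// Within a co-iterating while-loop, the if-statement above guards the body of
// one lattice point: the condition tests that each sparse index involved in
// that lattice point coincides with the current loop index (sketch; %ia, %ib,
// %i, %redIn are placeholders):
//
//   %ca   = arith.cmpi eq, %ia, %i : index
//   %cb   = arith.cmpi eq, %ib, %i : index
//   %cond = arith.andi %ca, %cb : i1
//   %r    = scf.if %cond -> (f64) {
//     ...                  // lattice-point body
//     scf.yield %va : f64
//   } else {
//     scf.yield %redIn : f64
//   }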
1553 
1554 /// Generates end of the true branch of an if-statement within a while-loop.
1555 static void endIf(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1556  linalg::GenericOp op, scf::IfOp ifOp, Operation *loop,
1557  Value redInput, Value cntInput) {
1558  SmallVector<Value, 4> operands;
1559  if (codegen.redVal) {
1560  operands.push_back(codegen.redVal);
1561  updateReduc(merger, codegen, redInput);
1562  }
1563  if (codegen.expValues) {
1564  operands.push_back(codegen.expCount);
1565  codegen.expCount = cntInput;
1566  }
1567  if (!operands.empty())
1568  builder.create<scf::YieldOp>(op.getLoc(), operands);
1569  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
1570 }
1571 
1572 //===----------------------------------------------------------------------===//
1573 // Sparse compiler synthesis methods (loop sequence).
1574 //===----------------------------------------------------------------------===//
1575 
1576 /// Starts a loop sequence at given level. Returns true if
1577 /// the universal loop index must be maintained at this level.
1578 static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1579  linalg::GenericOp op, std::vector<unsigned> &topSort,
1580  unsigned exp, unsigned at, unsigned idx, unsigned ldx,
1581  unsigned lts) {
1582  assert(codegen.curVecLength == 1);
1583  assert(!codegen.loops[idx]);
1584  // Emit invariants at this loop sequence level.
1585  genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true);
1586  // Emit access pattern expansion for sparse tensor output.
1587  genExpansion(merger, codegen, builder, op, at, /*atStart=*/true);
1588  // Emit further initialization at this loop sequence level.
1589  unsigned l0 = merger.set(lts)[0];
1590  bool needsUniv =
1591  genInit(merger, codegen, builder, op, topSort, at, merger.lat(l0).bits);
1592  // Maintain the universal index only if it is actually
1593  // consumed by a subsequent lattice point.
1594  if (needsUniv) {
1595  unsigned lsize = merger.set(lts).size();
1596  for (unsigned i = 1; i < lsize; i++) {
1597  unsigned li = merger.set(lts)[i];
1598  if (!merger.hasAnyDimOf(merger.lat(li).simple, Dim::kSparse))
1599  return true;
1600  }
1601  }
1602  return false;
1603 }
1604 
1605 /// Starts a single loop in the current sequence.
1606 static Operation *startLoop(Merger &merger, CodeGen &codegen,
1607  OpBuilder &builder, linalg::GenericOp op,
1608  std::vector<unsigned> &topSort, unsigned at,
1609  unsigned li, bool needsUniv) {
1610  assert(codegen.curVecLength == 1);
1611  // Emit the for/while-loop control.
1612  Operation *loop = genLoop(merger, codegen, builder, op, topSort, at,
1613  needsUniv, merger.lat(li).simple);
1614  // Emit the locals for this loop.
1615  genLocals(merger, codegen, builder, op, topSort, at, needsUniv,
1616  merger.lat(li).bits);
1617  return loop;
1618 }
1619 
1620 /// Ends a single loop in the current sequence. Returns the new value for needsUniv.
1621 static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1622  linalg::GenericOp op, Operation *loop, unsigned idx,
1623  unsigned li, bool needsUniv) {
1624  codegen.curVecLength = 1;
1625  // End a while-loop.
1626  if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
1627  genWhileInduction(merger, codegen, builder, op, idx, needsUniv,
1628  merger.lat(li).bits, whileOp);
1629  return needsUniv;
1630  }
1631  // End a for-loop.
1632  genForInduction(merger, codegen, builder, op, loop);
1633  return false;
1634 }
1635 
1636 /// Ends a loop sequence at given level.
1637 static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
1638  linalg::GenericOp op, unsigned exp, unsigned at,
1639  unsigned idx, unsigned ldx) {
1640  assert(codegen.curVecLength == 1);
1641  codegen.loops[idx] = Value();
1642  // Bring a pending reduction back from SIMD form when sequence ends.
1643  if (codegen.redVal)
1644  if (auto vtp = codegen.redVal.getType().dyn_cast<VectorType>())
1645  updateReduc(merger, codegen,
1646  genVectorReducEnd(codegen, builder, op.getLoc(), vtp));
1647  // Unmark bookkeeping of invariants and loop index.
1648  genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false);
1649  // Finalize access pattern expansion for sparse tensor output.
1650  genExpansion(merger, codegen, builder, op, at, /*atStart=*/false);
1651 }
1652 
1653 /// Recursively generates code while computing iteration lattices in order
1654 /// to manage the complexity of implementing co-iteration over unions
1655 /// and intersections of sparse iteration spaces.
1656 static void genStmt(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
1657  linalg::GenericOp op, std::vector<unsigned> &topSort,
1658  unsigned exp, unsigned at) {
1659  // At each leaf, assign remaining tensor (sub)expression to output tensor.
1660  if (at == topSort.size()) {
1661  unsigned ldx = topSort[at - 1];
1662  Value rhs = genExp(merger, codegen, rewriter, op, exp, ldx);
1663  genTensorStore(merger, codegen, rewriter, op, exp, rhs);
1664  return;
1665  }
1666 
1667  // Construct iteration lattices for current loop index, with L0 at top.
1668  unsigned idx = topSort[at];
1669  unsigned ldx = at == 0 ? -1u : topSort[at - 1];
1670  unsigned lts = merger.optimizeSet(merger.buildLattices(exp, idx));
1671 
1672  // Start a loop sequence.
1673  bool needsUniv = startLoopSeq(merger, codegen, rewriter, op, topSort, exp, at,
1674  idx, ldx, lts);
1675 
1676  // Emit a loop for every lattice point L0 >= Li in this loop sequence.
1677  unsigned lsize = merger.set(lts).size();
1678  for (unsigned i = 0; i < lsize; i++) {
1679  // Start a loop.
1680  unsigned li = merger.set(lts)[i];
1681  Operation *loop =
1682  startLoop(merger, codegen, rewriter, op, topSort, at, li, needsUniv);
1683 
1684  // Visit all lattice points with Li >= Lj to generate the
1685  // loop-body, possibly with if statements for co-iteration.
1686  Value redInput = codegen.redVal;
1687  Value cntInput = codegen.expCount;
1688  bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;
1689  for (unsigned j = 0; j < lsize; j++) {
1690  unsigned lj = merger.set(lts)[j];
1691  unsigned ej = merger.lat(lj).exp;
1692  if (li == lj || merger.latGT(li, lj)) {
1693  // Recurse into body of each branch.
1694  if (isWhile) {
1695  scf::IfOp ifOp =
1696  genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple);
1697  genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
1698  endIf(merger, codegen, rewriter, op, ifOp, loop, redInput, cntInput);
1699  } else {
1700  genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
1701  }
1702  }
1703  }
1704 
1705  // End a loop.
1706  needsUniv =
1707  endLoop(merger, codegen, rewriter, op, loop, idx, li, needsUniv);
1708  }
1709 
1710  // End a loop sequence.
1711  endLoopSeq(merger, codegen, rewriter, op, exp, at, idx, ldx);
1712 }
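// Putting the pieces together, a sparse vector addition x(i) = a(i) + b(i)
// with both a and b sparse is lowered by this recursion into, roughly, one
// while-loop that co-iterates a and b with three guarded cases, followed by
// two for-loops that mop up the remaining tails (high-level sketch only):
//
//   while (pa < pa_end && pb < pb_end) {
//     ia = a.indices[pa]; ib = b.indices[pb]; i = min(ia, ib);
//     if (ia == i && ib == i)      x[i] = a.values[pa] + b.values[pb];
//     else if (ia == i)            x[i] = a.values[pa];
//     else                         x[i] = b.values[pb];
//     pa += (ia == i); pb += (ib == i);
//   }
//   for (; pa < pa_end; pa++)      x[a.indices[pa]] = a.values[pa];
//   for (; pb < pb_end; pb++)      x[b.indices[pb]] = b.values[pb];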
1713 
1714 /// Converts the result computed by the sparse kernel into the required form.
1715 static void genResult(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
1716  linalg::GenericOp op) {
1717  OpOperand *lhs = op.getOutputOperand(0);
1718  Type resType = lhs->get().getType();
1719  if (getSparseTensorEncoding(resType)) {
1720  // The sparse tensor rematerializes from the original sparse tensor's
1721  // underlying sparse storage format.
1722  rewriter.replaceOpWithNewOp<LoadOp>(op, resType, lhs->get(),
1723  codegen.sparseOut == lhs);
1724  } else {
1725  // To rematerialize a non-annotated tensor, simply load it
1726  // from the bufferized value.
1727  Value val = codegen.buffers.back(); // value array
1728  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, resType, val);
1729  }
1730 }
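// Sketch of the two cases above (types and encoding are placeholders): a
// sparse output rematerializes through sparse_tensor.load, a dense output
// through bufferization.to_tensor of the values buffer:
//
//   %1 = sparse_tensor.load %0 : tensor<?x?xf64, #SparseMatrix>
//   %2 = bufferization.to_tensor %buf : memref<?x?xf64>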
1731 
1732 //===----------------------------------------------------------------------===//
1733 // Sparse compiler rewriting methods.
1734 //===----------------------------------------------------------------------===//
1735 
1736 namespace {
1737 
1738 /// Sparse rewriting rule for generic Linalg operation.
1739 struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> {
1740 public:
1741  GenericOpSparsifier(MLIRContext *context, SparsificationOptions o)
1742      : OpRewritePattern<linalg::GenericOp>(context), options(o) {}
1743 
1744  LogicalResult matchAndRewrite(linalg::GenericOp op,
1745  PatternRewriter &rewriter) const override {
1746  // Detects sparse annotations and translates the per-dimension sparsity
1747  // information for all tensors to loop indices in the kernel.
1748  assert(op.getNumOutputs() == 1);
1749  unsigned numTensors = op.getNumInputsAndOutputs();
1750  unsigned numLoops = op.iterator_types().getValue().size();
1751  Merger merger(numTensors, numLoops);
1752  if (!findSparseAnnotations(merger, op))
1753  return failure();
1754 
1755  // Computes a topologically sorted iteration graph to ensure tensors
1756  // are visited in natural index order. Gradually relaxes the considered
1757  // constraints until an acyclic iteration graph results, such that sparse
1758  // code generation can proceed. As a last resort, an attempt is made
1759  // to resolve cycles by inserting a conversion.
1760  std::vector<unsigned> topSort;
1761  if (!computeIterationGraph(merger, op, topSort, SortMask::kIncludeAll) &&
1762  !computeIterationGraph(merger, op, topSort, SortMask::kIncludeUndef) &&
1763  !computeIterationGraph(merger, op, topSort, SortMask::kIncludeDense) &&
1764  !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly)) {
1765  return resolveCycle(merger, rewriter, op);
1766  }
1767 
1768  // Builds the tensor expression for the Linalg operation in SSA form.
1769  Optional<unsigned> optExp = merger.buildTensorExpFromLinalg(op);
1770  if (!optExp.hasValue())
1771  return failure();
1772  unsigned exp = optExp.getValue();
1773 
1774  // Rejects an inadmissable tensor expression.
1775  OpOperand *sparseOut = nullptr;
1776  unsigned outerParNest = 0;
1777  if (!isAdmissableTensorExp(merger, op, topSort, exp, &sparseOut,
1778  outerParNest))
1779  return failure();
1780 
1781  // Recursively generates code.
1782  merger.setHasSparseOut(sparseOut != nullptr);
1783  CodeGen codegen(options, numTensors, numLoops, sparseOut, outerParNest);
1784  genBuffers(merger, codegen, rewriter, op);
1785  genStmt(merger, codegen, rewriter, op, topSort, exp, 0);
1786  genResult(merger, codegen, rewriter, op);
1787  return success();
1788  }
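  // As an example of a kernel this pattern rewrites (illustrative; #CSR and
  // #matvec_trait are placeholder names), a sparse matrix-vector product is an
  // ordinary linalg.generic whose matrix operand carries a sparse encoding:
  //
  //   #CSR = #sparse_tensor.encoding<{ dimLevelType = ["dense","compressed"] }>
  //
  //   %0 = linalg.generic #matvec_trait
  //     ins(%A, %b : tensor<?x?xf64, #CSR>, tensor<?xf64>)
  //     outs(%x : tensor<?xf64>) {
  //       ^bb0(%a: f64, %bv: f64, %acc: f64):
  //         %m = arith.mulf %a, %bv : f64
  //         %s = arith.addf %acc, %m : f64
  //         linalg.yield %s : f64
  //   } -> tensor<?xf64>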
1789 
1790 private:
1791  // Last resort cycle resolution.
1792  LogicalResult resolveCycle(Merger &merger, PatternRewriter &rewriter,
1793  linalg::GenericOp op) const {
1794  // Compute topological sort while leaving out every
1795  // sparse input tensor in succession until an acyclic
1796  // iteration graph results.
1797  std::vector<unsigned> topSort;
1798  for (OpOperand *t : op.getInputOperands()) {
1799  unsigned tensor = t->getOperandNumber();
1800  Value tval = t->get();
1801  auto srcEnc = getSparseTensorEncoding(tval.getType());
1802  if (!srcEnc ||
1803  !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly, t))
1804  continue;
1805  // Found an input tensor that resolves the cycle by inserting a
1806  // conversion into a sparse tensor that adheres to the iteration
1807  // graph order. Also releases the temporary sparse tensor.
1808  //
1809  // TODO: investigate fusing the conversion with computation,
1810  // especially if it is a direct yield!
1811  //
1812  auto srcTp = tval.getType().cast<RankedTensorType>();
1813  auto dstEnc = SparseTensorEncodingAttr::get(
1814  op->getContext(), srcEnc.getDimLevelType(),
1815  permute(getContext(), op.getTiedIndexingMap(t), topSort), // new order
1816  srcEnc.getPointerBitWidth(), srcEnc.getIndexBitWidth());
1817  auto dstTp = RankedTensorType::get(srcTp.getShape(),
1818  srcTp.getElementType(), dstEnc);
1819  auto convert = rewriter.create<ConvertOp>(tval.getLoc(), dstTp, tval);
1820  op->setOperand(tensor, convert);
1821  rewriter.setInsertionPointAfter(op);
1822  rewriter.create<ReleaseOp>(tval.getLoc(), convert);
1823  return success();
1824  }
1825  // Cannot be resolved with a single conversion.
1826  // TODO: convert more than one?
1827  return failure();
1828  }
1829 
1830  /// Options to control sparse code generation.
1831  SparsificationOptions options;
1832 };
1833 
1834 } // namespace
1835 
1836 /// Populates the given patterns list with rewriting rules required for
1837 /// the sparsification of linear algebra operations.
1838 void mlir::populateSparsificationPatterns(
1839     RewritePatternSet &patterns, const SparsificationOptions &options) {
1840  patterns.add<GenericOpSparsifier>(patterns.getContext(), options);
1841 }
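// A typical way to drive these patterns from a pass (sketch; assumes the
// standard greedy rewrite driver and a surrounding function-level pass):
//
//   RewritePatternSet patterns(&getContext());
//   populateSparsificationPatterns(patterns, SparsificationOptions());
//   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));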