//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate NVVMIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTONVVMOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

namespace {

/// Convert gpu dialect shfl mode enum to the equivalent nvvm one.
static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
  switch (mode) {
  case gpu::ShuffleMode::XOR:
    return NVVM::ShflKind::bfly;
  case gpu::ShuffleMode::UP:
    return NVVM::ShflKind::up;
  case gpu::ShuffleMode::DOWN:
    return NVVM::ShflKind::down;
  case gpu::ShuffleMode::IDX:
    return NVVM::ShflKind::idx;
  }
  llvm_unreachable("unknown shuffle mode");
}

/// Convert a gpu dialect all-reduce operation to the equivalent NVVM redux
/// kind, or std::nullopt if redux has no equivalent for that operation.
static std::optional<NVVM::ReduxKind>
convertReduxKind(gpu::AllReduceOperation mode) {
  switch (mode) {
  case gpu::AllReduceOperation::ADD:
    return NVVM::ReduxKind::ADD;
  case gpu::AllReduceOperation::MUL:
    return std::nullopt;
  case gpu::AllReduceOperation::MINSI:
    return NVVM::ReduxKind::MIN;
  case gpu::AllReduceOperation::MINUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MINF:
    return NVVM::ReduxKind::MIN;
  case gpu::AllReduceOperation::MAXSI:
    return NVVM::ReduxKind::MAX;
  case gpu::AllReduceOperation::MAXUI:
    return std::nullopt;
  case gpu::AllReduceOperation::MAXF:
    return NVVM::ReduxKind::MAX;
  case gpu::AllReduceOperation::AND:
    return NVVM::ReduxKind::AND;
  case gpu::AllReduceOperation::OR:
    return NVVM::ReduxKind::OR;
  case gpu::AllReduceOperation::XOR:
    return NVVM::ReduxKind::XOR;
  case gpu::AllReduceOperation::MINIMUMF:
  case gpu::AllReduceOperation::MAXIMUMF:
    return std::nullopt;
  }
  return std::nullopt;
}

/// Lowers the gpu.subgroup_reduce op into the nvvm.redux op. The op must be
/// run by the entire subgroup, otherwise the behavior is undefined.
struct GPUSubgroupReduceOpLowering
    : public ConvertOpToLLVMPattern<gpu::SubgroupReduceOp> {
  using ConvertOpToLLVMPattern<gpu::SubgroupReduceOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (!op.getUniform())
      return rewriter.notifyMatchFailure(
          op, "cannot be lowered to redux as the op must be run "
              "uniformly (entire subgroup).");
    if (!op.getValue().getType().isInteger(32))
      return rewriter.notifyMatchFailure(op, "unsupported data type");

    std::optional<NVVM::ReduxKind> mode = convertReduxKind(op.getOp());
    if (!mode.has_value())
      return rewriter.notifyMatchFailure(
          op, "unsupported reduction mode for redux");

    Location loc = op->getLoc();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value offset = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);

    auto reduxOp = rewriter.create<NVVM::ReduxOp>(loc, int32Type, op.getValue(),
                                                  mode.value(), offset);

    rewriter.replaceOp(op, reduxOp->getResult(0));
    return success();
  }
};

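// For illustration only (hypothetical input; the exact printed form may vary
// between MLIR versions), a uniform 32-bit integer subgroup reduction such as
//
//   %sum = gpu.subgroup_reduce add %val uniform : (i32) -> (i32)
//
// is rewritten by the pattern above into roughly
//
//   %minus_one = llvm.mlir.constant(-1 : i32) : i32
//   %sum = nvvm.redux.sync add %val, %minus_one : i32 -> i32
//
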
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding NVVM op.
  ///
  /// Convert the `width` argument into an activeMask (a bitmask which
  /// specifies which threads participate in the shuffle) and a maskAndClamp
  /// (specifying the highest lane which participates in the shuffle).
  ///
  ///     %one = llvm.constant(1 : i32) : i32
  ///     %minus_one = llvm.constant(-1 : i32) : i32
  ///     %thirty_two = llvm.constant(32 : i32) : i32
  ///     %num_lanes = llvm.sub %thirty_two, %width : i32
  ///     %active_mask = llvm.lshr %minus_one, %num_lanes : i32
  ///     %mask_and_clamp = llvm.sub %width, %one : i32
  ///     %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
  ///         %mask_and_clamp : !llvm<"{ float, i1 }">
  ///     %shfl_value = llvm.extractvalue %shfl[0] : !llvm<"{ float, i1 }">
  ///     %shfl_pred = llvm.extractvalue %shfl[1] : !llvm<"{ float, i1 }">
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();

    auto valueTy = adaptor.getValue().getType();
    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    auto predTy = IntegerType::get(rewriter.getContext(), 1);
    auto resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
                                                     {valueTy, predTy});

    Value one = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 1);
    Value minusOne = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);
    Value thirtyTwo = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 32);
    Value numLeadInactiveLane = rewriter.create<LLVM::SubOp>(
        loc, int32Type, thirtyTwo, adaptor.getWidth());
    // Bit mask of active lanes: `(-1) >> (32 - activeWidth)`.
    Value activeMask = rewriter.create<LLVM::LShrOp>(loc, int32Type, minusOne,
                                                     numLeadInactiveLane);
    Value maskAndClamp;
    if (op.getMode() == gpu::ShuffleMode::UP) {
      // Clamp lane: `32 - activeWidth`
      maskAndClamp = numLeadInactiveLane;
    } else {
      // Clamp lane: `activeWidth - 1`
      maskAndClamp =
          rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.getWidth(), one);
    }

    auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
    Value shfl = rewriter.create<NVVM::ShflOp>(
        loc, resultTy, activeMask, adaptor.getValue(), adaptor.getOffset(),
        maskAndClamp, convertShflKind(op.getMode()), returnValueAndIsValidAttr);
    Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
    Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);

    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

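// For illustration only (hypothetical input; exact syntax may vary between
// MLIR versions), the pattern above rewrites
//
//   %shfl, %pred = gpu.shuffle xor %val, %offset, %width : f32
//
// into the constant/mask computation and nvvm.shfl.sync op shown in the
// documentation comment of GPUShuffleOpLowering.
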
struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    Value newOp = rewriter.create<NVVM::LaneIdOp>(loc, rewriter.getI32Type());
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      newOp = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), newOp);
    } else if (indexBitwidth < 32) {
      newOp = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), newOp);
    }
    rewriter.replaceOp(op, {newOp});
    return success();
  }
};

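// For illustration only (hypothetical input; exact syntax may vary), with a
// 64-bit index type
//
//   %id = gpu.lane_id
//
// becomes roughly
//
//   %0 = nvvm.read.ptx.sreg.laneid : i32
//   %id = llvm.sext %0 : i32 to i64
//
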
/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"

/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
/// This pass only handles device code and is not meant to be run on GPU host
/// code.
struct LowerGpuOpsToNVVMOpsPass
    : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  using Base::Base;

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(&getContext()));
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        m.getContext(),
        DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);
    options.useBarePtrCallConv = useBarePtrCallConv;

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(m.getContext());
      populateGpuRewritePatterns(patterns);
      if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns))))
        return signalPassFailure();
    }

    LLVMTypeConverter converter(m.getContext(), options);
    // NVVM uses alloca in the default address space to represent private
    // memory allocations, so drop private annotations. NVVM uses address
    // space 3 for shared memory. NVVM uses the default address space to
    // represent global memory.
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) -> unsigned {
          switch (space) {
          case gpu::AddressSpace::Global:
            return static_cast<unsigned>(
                NVVM::NVVMMemorySpace::kGlobalMemorySpace);
          case gpu::AddressSpace::Workgroup:
            return static_cast<unsigned>(
                NVVM::NVVMMemorySpace::kSharedMemorySpace);
          case gpu::AddressSpace::Private:
            return 0;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });
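
    // Under this mapping (illustrative example; exact printed form may vary),
    // a buffer such as
    //   memref<16xf32, #gpu.address_space<workgroup>>
    // is addressed through an LLVM pointer in address space 3 (!llvm.ptr<3>),
    // while Private attributions fall back to the default address space 0.
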
    // Lowering for MMAMatrixType.
    converter.addConversion([&](gpu::MMAMatrixType type) -> Type {
      return convertMMAToLLVMType(type);
    });
    RewritePatternSet llvmPatterns(m.getContext());

    arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
    populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
    if (this->hasRedux)
      populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns);
    LLVMConversionTarget target(getContext());
    configureGpuToNVVMConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
  }
};

} // namespace

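// For reference, this conversion is typically driven through the pass
// pipeline, for example (the pass argument and option names shown here are
// illustrative and may differ between MLIR releases):
//
//   mlir-opt --convert-gpu-to-nvvm='index-bitwidth=32 has-redux=1' kernel.mlir
//
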
void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

template <typename OpTy>
static void populateOpPatterns(LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}

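// For illustration only (hypothetical IR; libdevice names taken from the
// registrations below), populateOpPatterns<math::CosOp>(converter, patterns,
// "__nv_cosf", "__nv_cos") rewrites
//
//   %y = math.cos %x : f32
//
// into roughly
//
//   %y = llvm.call @__nv_cosf(%x) : (f32) -> f32
//
// with vector operands first unrolled to scalars by
// ScalarizeVectorOpLowering.
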
void mlir::populateGpuSubgroupReduceOpLoweringPattern(
    LLVMTypeConverter &converter, RewritePatternSet &patterns) {
  patterns.add<GPUSubgroupReduceOpLowering>(converter);
}

void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                               RewritePatternSet &patterns) {
  populateWithGenerated(patterns);
  patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
  patterns.add<
      GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
                                  NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
      GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
                                  NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
      GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
                                  NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
      GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
                                  NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
      GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
                                  NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                  NVVM::GridDimYOp, NVVM::GridDimZOp>,
      GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);

  patterns.add<GPUDynamicSharedMemoryOpLowering>(
      converter, NVVM::kSharedMemoryAlignmentBit);

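  // For illustration only (hypothetical input; exact syntax may vary), the
  // index intrinsic patterns registered above rewrite
  //
  //   %tid = gpu.thread_id x
  //
  // into roughly
  //
  //   %0 = nvvm.read.ptx.sreg.tid.x : i32
  //   %tid = llvm.sext %0 : i32 to i64   // when the index bitwidth is 64
  //
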
  // Explicitly drop memory space when lowering private memory
  // attributions since NVVM models it as `alloca`s in the default
  // memory space and does not support `alloca`s with addrspace(5).
  patterns.add<GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/0,
      /*workgroupAddrSpace=*/
      static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
      StringAttr::get(&converter.getContext(),
                      NVVM::NVVMDialect::getKernelFuncAttrName()));

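  // For illustration only (hypothetical input; exact syntax may vary), a
  // kernel function such as
  //
  //   gpu.func @kernel(%arg0: f32) kernel { ... }
  //
  // becomes an llvm.func carrying the NVVM kernel marker, roughly
  //
  //   llvm.func @kernel(%arg0: f32) attributes {nvvm.kernel} { ... }
  //
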
  populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                   "__nv_fabs");
  populateOpPatterns<math::AtanOp>(converter, patterns, "__nv_atanf",
                                   "__nv_atan");
  populateOpPatterns<math::Atan2Op>(converter, patterns, "__nv_atan2f",
                                    "__nv_atan2");
  populateOpPatterns<math::CbrtOp>(converter, patterns, "__nv_cbrtf",
                                   "__nv_cbrt");
  populateOpPatterns<math::CeilOp>(converter, patterns, "__nv_ceilf",
                                   "__nv_ceil");
  populateOpPatterns<math::CosOp>(converter, patterns, "__nv_cosf", "__nv_cos");
  populateOpPatterns<math::ExpOp>(converter, patterns, "__nv_expf", "__nv_exp");
  populateOpPatterns<math::Exp2Op>(converter, patterns, "__nv_exp2f",
                                   "__nv_exp2");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, "__nv_expm1f",
                                    "__nv_expm1");
  populateOpPatterns<math::FloorOp>(converter, patterns, "__nv_floorf",
                                    "__nv_floor");
  populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
                                    "__nv_fmod");
  populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log");
  populateOpPatterns<math::Log1pOp>(converter, patterns, "__nv_log1pf",
                                    "__nv_log1p");
  populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
                                    "__nv_log10");
  populateOpPatterns<math::Log2Op>(converter, patterns, "__nv_log2f",
                                   "__nv_log2");
  populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf",
                                   "__nv_pow");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, "__nv_rsqrtf",
                                    "__nv_rsqrt");
  populateOpPatterns<math::SinOp>(converter, patterns, "__nv_sinf", "__nv_sin");
  populateOpPatterns<math::SqrtOp>(converter, patterns, "__nv_sqrtf",
                                   "__nv_sqrt");
  populateOpPatterns<math::TanhOp>(converter, patterns, "__nv_tanhf",
                                   "__nv_tanh");
  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
}