79 ConversionPatternRewriter &rewriter)
const {
83 if (encodeWorkgroupAttributionsAsArguments) {
88 gpuFuncOp.getWorkgroupAttributions();
89 size_t numAttributions = workgroupAttributions.size();
92 unsigned index = gpuFuncOp.getNumArguments();
96 Type workgroupPtrType =
97 rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
102 rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
103 rewriter.getUnitAttr()),
104 rewriter.getNamedAttr(
105 getDialect().getWorkgroupAttributionAttrHelper().getName(),
106 rewriter.getUnitAttr()),
110 auto attributionType = cast<MemRefType>(attribution.getType());
111 IntegerAttr numElements =
112 rewriter.getI64IntegerAttr(attributionType.getNumElements());
113 Type llvmElementType =
115 if (!llvmElementType)
117 TypeAttr type = TypeAttr::get(llvmElementType);
118 attrs.back().setValue(
119 rewriter.getAttr<LLVM::WorkgroupAttributionAttr>(numElements, type));
120 argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
127 rewriter.modifyOpInPlace(
128 gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
130 static_cast<FunctionOpInterface
>(gpuFuncOp).insertArguments(
131 argIndices, argTypes, argAttrs, argLocs);
134 "expected GPU funcs to support inserting any argument");
137 workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
138 for (
auto [idx, attribution] :
139 llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
140 auto type = dyn_cast<MemRefType>(attribution.getType());
141 assert(type && type.hasStaticShape() &&
"unexpected type in attribution");
143 uint64_t numElements = type.getNumElements();
146 cast<Type>(typeConverter->convertType(type.getElementType()));
147 auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
149 std::string(llvm::formatv(
"__wg_{0}_{1}", gpuFuncOp.getName(), idx));
150 uint64_t alignment = 0;
151 if (
auto alignAttr = dyn_cast_or_null<IntegerAttr>(
152 gpuFuncOp.getWorkgroupAttributionAttr(
153 idx, LLVM::LLVMDialect::getAlignAttrName())))
154 alignment = alignAttr.getInt();
155 auto globalOp = LLVM::GlobalOp::create(
156 rewriter, gpuFuncOp.getLoc(), arrayType,
false,
157 LLVM::Linkage::Internal, name,
Attribute(), alignment,
159 workgroupBuffers.push_back(globalOp);
164 TypeConverter::SignatureConversion signatureConversion(
165 gpuFuncOp.front().getNumArguments());
168 gpuFuncOp.getFunctionType(),
false,
171 return rewriter.notifyMatchFailure(gpuFuncOp, [&](
Diagnostic &
diag) {
172 diag <<
"failed to convert function signature type for: "
173 << gpuFuncOp.getFunctionType();
181 for (
const auto &attr : gpuFuncOp->getAttrs()) {
183 attr.getName() == gpuFuncOp.getFunctionTypeAttrName() ||
185 gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
186 attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
187 attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
188 attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
189 attr.getName() == gpuFuncOp.getKnownGridSizeAttrName() ||
190 attr.getName() == gpuFuncOp.getKnownClusterSizeAttrName())
192 if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
193 argAttrs = gpuFuncOp.getArgAttrsAttr();
196 attributes.push_back(attr);
204 auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
206 attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
209 attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
211 if (knownClusterSize)
212 attributes.emplace_back(
213 gpuDialect->getKnownClusterSizeAttrHelper().getName(),
219 if (gpuFuncOp.isKernel()) {
220 if (kernelAttributeName)
221 attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
223 if (kernelBlockSizeAttributeName && knownBlockSize) {
224 attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
227 if (kernelClusterSizeAttributeName && knownClusterSize) {
228 attributes.emplace_back(kernelClusterSizeAttributeName, knownClusterSize);
231 LLVM::CConv callingConvention = gpuFuncOp.isKernel()
232 ? kernelCallingConvention
233 : nonKernelCallingConvention;
234 auto llvmFuncOp = LLVM::LLVMFuncOp::create(
235 rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
236 LLVM::Linkage::External,
false, callingConvention,
237 nullptr, attributes);
248 rewriter.setInsertionPointToStart(&gpuFuncOp.front());
249 unsigned numProperArguments = gpuFuncOp.getNumArguments();
251 if (encodeWorkgroupAttributionsAsArguments) {
254 unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
255 assert(numProperArguments >= numAttributions &&
256 "Expecting attributions to be encoded as arguments already");
261 gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
263 for (
auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
264 gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
265 auto [attribution, arg] = vals;
266 auto type = cast<MemRefType>(attribution.getType());
274 signatureConversion.remapInput(numProperArguments + idx, descr);
277 for (
const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
278 auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
279 global.getAddrSpace());
280 Value address = LLVM::AddressOfOp::create(rewriter, loc, ptrType,
281 global.getSymNameAttr());
283 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getType(),
290 Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
291 auto type = cast<MemRefType>(attribution.
getType());
294 signatureConversion.remapInput(numProperArguments + idx, descr);
299 unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
300 auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
301 for (
const auto [idx, attribution] :
302 llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
303 auto type = cast<MemRefType>(attribution.getType());
304 assert(type && type.hasStaticShape() &&
"unexpected type in attribution");
309 Type elementType = typeConverter->convertType(type.getElementType());
311 LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
312 Value numElements = LLVM::ConstantOp::create(
313 rewriter, gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
314 uint64_t alignment = 0;
316 dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getPrivateAttributionAttr(
317 idx, LLVM::LLVMDialect::getAlignAttrName())))
318 alignment = alignAttr.getInt();
320 LLVM::AllocaOp::create(rewriter, gpuFuncOp.getLoc(), ptrType,
321 elementType, numElements, alignment);
324 signatureConversion.remapInput(
325 numProperArguments + numWorkgroupAttributions + idx, descr);
330 rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
332 if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
333 &signatureConversion)))
338 for (
const auto [idx, argTy] :
339 llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
340 auto remapping = signatureConversion.getInputMapping(idx);
342 argAttrs ? cast<DictionaryAttr>(argAttrs[idx]) :
NamedAttrList();
343 auto copyAttribute = [&](StringRef attrName) {
347 for (
size_t i = 0, e = remapping->size; i < e; ++i)
348 llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
350 auto copyPointerAttribute = [&](StringRef attrName) {
355 if (remapping->size > 1 &&
356 attrName == LLVM::LLVMDialect::getNoAliasAttrName()) {
358 "Cannot copy noalias with non-bare pointers.\n");
361 for (
size_t i = 0, e = remapping->size; i < e; ++i) {
362 if (isa<LLVM::LLVMPointerType>(
363 llvmFuncOp.getArgument(remapping->inputNo + i).getType())) {
364 llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
372 copyAttribute(LLVM::LLVMDialect::getReturnedAttrName());
373 copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName());
374 copyAttribute(LLVM::LLVMDialect::getInRegAttrName());
375 bool lowersToPointer =
false;
376 for (
size_t i = 0, e = remapping->size; i < e; ++i) {
377 lowersToPointer |= isa<LLVM::LLVMPointerType>(
378 llvmFuncOp.getArgument(remapping->inputNo + i).getType());
381 if (lowersToPointer) {
382 copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName());
383 copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName());
384 copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName());
385 copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName());
386 copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName());
387 copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName());
388 copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName());
389 copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName());
390 copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
391 copyPointerAttribute(
392 LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
393 copyPointerAttribute(
394 LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr());
397 rewriter.eraseOp(gpuFuncOp);
// Rewrites a gpu::PrintfOp into a chain of LLVM calls that thread a running
// descriptor value (`printfDesc`) from one call to the next: a begin call
// (`ocklBegin`), one call that appends the format string
// (`ocklAppendStringN`), then zero or more calls that append the operands in
// groups (`ocklAppendArgs`). The original op is erased afterwards.
//
// Parameters:
//   gpuPrintfOp - the op being rewritten; supplies the location and is erased.
//   adaptor     - supplies the format string (getFormat) and operands (getArgs).
//   rewriter    - used to create the new ops, report match failure, and erase.
402 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
403 ConversionPatternRewriter &rewriter)
const {
404 Location loc = gpuPrintfOp->getLoc();
// Type-converted i8/i32/i64 and an LLVM pointer type created without an
// explicit address space; these make up the runtime function signatures below.
406 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
407 auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
408 mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
409 mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
// Bail out with a match failure when no enclosing module is available.
// NOTE(review): the module lookup that guards this return is not visible here.
413 return rewriter.notifyMatchFailure(gpuPrintfOp,
414 "Couldn't find a parent module");
// Function type with a single i64 parameter and an i64 result; presumably
// the type of the `ocklBegin` declaration called below -- confirm.
418 LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
419 LLVM::LLVMFuncOp ocklAppendArgs;
// The append-args runtime function is only looked up when there are operands.
420 if (!adaptor.getArgs().empty()) {
422 moduleOp, loc, rewriter,
"__ockl_printf_append_args",
// Signature: i64, i32, seven i64 values, i32 -> i64. This lines up with the
// operand list built in the grouping loop below (descriptor, count, seven
// payload slots, final flag).
423 LLVM::LLVMFunctionType::get(
424 llvmI64, {llvmI64, llvmI32, llvmI64, llvmI64, llvmI64,
425 llvmI64, llvmI64, llvmI64, llvmI64, llvmI32}));
428 moduleOp, loc, rewriter,
"__ockl_printf_append_string_n",
// Parameters: i64, pointer, i64, i32 -- the same shape as the operands passed
// to the format-string call below (descriptor, string start, length, flag).
429 LLVM::LLVMFunctionType::get(
431 {llvmI64, ptrType, llvmI64, llvmI32}));
// Begin the printf: call `ocklBegin` with the constant 0 and keep its result
// as the descriptor that every later call consumes and replaces.
434 Value zeroI64 = LLVM::ConstantOp::create(rewriter, loc, llvmI64, 0);
435 auto printfBeginCall =
436 LLVM::CallOp::create(rewriter, loc, ocklBegin, zeroI64);
437 Value printfDesc = printfBeginCall.getResult();
// Arguments of the call that yields `global` for the format string, using the
// name prefix "printfFormat_" and an i8 element type.
// NOTE(review): the callee itself is not visible here.
441 rewriter, loc, moduleOp, llvmI8,
"printfFormat_", adaptor.getFormat());
// Take the address of the global in the global's own address space.
444 Value globalPtr = LLVM::AddressOfOp::create(
446 LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
447 global.getSymNameAttr());
// GEP into the global; presumably produces `stringStart` used below -- confirm.
449 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
// The string length is the size of the global's StringAttr value.
451 Value stringLen = LLVM::ConstantOp::create(
452 rewriter, loc, llvmI64, cast<StringAttr>(global.getValueAttr()).size());
454 Value oneI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 1);
455 Value zeroI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 0);
// Append the format string. The trailing i32 is 1 when the printf has no
// operands and 0 otherwise (presumably an "is last call" marker -- confirm).
457 auto appendFormatCall = LLVM::CallOp::create(
458 rewriter, loc, ocklAppendStringN,
459 ValueRange{printfDesc, stringStart, stringLen,
460 adaptor.getArgs().empty() ? oneI32 : zeroI32});
461 printfDesc = appendFormatCall.getResult();
// Operands are appended in groups of at most `argsPerAppend` per call.
464 constexpr size_t argsPerAppend = 7;
465 size_t nArgs = adaptor.getArgs().size();
466 for (
size_t group = 0; group < nArgs; group += argsPerAppend) {
// `bound` is the exclusive end index of this group, clamped to nArgs.
467 size_t bound = std::min(group + argsPerAppend, nArgs);
468 size_t numArgsThisCall = bound - group;
// Call operands start with the current descriptor.
// NOTE(review): the declaration of `arguments` is not visible here.
471 arguments.push_back(printfDesc);
// i32 constant holding how many operands this call carries; presumably
// pushed onto `arguments` by the enclosing (not visible) statement.
473 LLVM::ConstantOp::create(rewriter, loc, llvmI32, numArgsThisCall));
474 for (
size_t i = group; i < bound; ++i) {
475 Value arg = adaptor.getArgs()[i];
// Float operands that are not already f64 are extended to f64.
476 if (
auto floatType = dyn_cast<FloatType>(arg.
getType())) {
477 if (!floatType.isF64())
478 arg = LLVM::FPExtOp::create(
479 rewriter, loc, typeConverter->convertType(rewriter.getF64Type()),
// Reinterpret the value as an i64 so it fits an i64 payload slot.
481 arg = LLVM::BitcastOp::create(rewriter, loc, llvmI64, arg);
// Zero-extend to i64.
// NOTE(review): the condition guarding this extension is not visible here.
484 arg = LLVM::ZExtOp::create(rewriter, loc, llvmI64, arg);
486 arguments.push_back(arg);
// Fill the remaining payload slots with zero so argsPerAppend values follow
// the count.
489 for (
size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
490 arguments.push_back(zeroI64);
// The final operand is 1 for the group that reaches the last operand, else 0.
493 auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
494 arguments.push_back(isLast);
// The call result becomes the descriptor for the next group.
495 auto call = LLVM::CallOp::create(rewriter, loc, ocklAppendArgs, arguments);
496 printfDesc = call.getResult();
// All replacement calls have been emitted; remove the original printf op.
498 rewriter.eraseOp(gpuPrintfOp);
551 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
552 ConversionPatternRewriter &rewriter)
const {
553 Location loc = gpuPrintfOp->getLoc();
555 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
556 mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
560 return rewriter.notifyMatchFailure(gpuPrintfOp,
561 "Couldn't find a parent module");
569 LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType, ptrType});
571 moduleOp, globalLoc, rewriter,
"vprintf", vprintfType);
574 LLVM::GlobalOp global =
576 "printfFormat_", adaptor.getFormat());
579 Value globalPtr = LLVM::AddressOfOp::create(rewriter, loc, global);
581 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
586 for (
Value arg : adaptor.getArgs()) {
587 Type type = arg.getType();
588 Value promotedArg = arg;
590 if (isa<FloatType>(type)) {
591 type = rewriter.getF64Type();
592 promotedArg = LLVM::FPExtOp::create(rewriter, loc, type, arg);
594 types.push_back(type);
595 args.push_back(promotedArg);
598 LLVM::LLVMStructType::getLiteral(gpuPrintfOp.getContext(), types);
599 Value one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(),
600 rewriter.getIndexAttr(1));
602 LLVM::AllocaOp::create(rewriter, loc, ptrType, structType, one,
604 for (
auto [
index, arg] : llvm::enumerate(args)) {
606 rewriter, loc, ptrType, structType, tempAlloc,
608 LLVM::StoreOp::create(rewriter, loc, arg,
ptr);
610 std::array<Value, 2> printfArgs = {stringStart, tempAlloc};
612 LLVM::CallOp::create(rewriter, loc, vprintfDecl, printfArgs);
613 rewriter.eraseOp(gpuPrintfOp);