79 ConversionPatternRewriter &rewriter)
const {
// Body of the pattern that replaces `gpuFuncOp` with an LLVM::LLVMFuncOp.
// NOTE(review): the leading number on each line is the line number of the
// original file; lines between consecutive numbers are not shown in this view,
// so the comments below only describe what the visible lines establish.

// Mode 1: workgroup attributions are exposed as extra function arguments
// (see the insertArguments call below).
83 if (encodeWorkgroupAttributionsAsArguments) {
88 gpuFuncOp.getWorkgroupAttributions();
89 size_t numAttributions = workgroupAttributions.size();
// Current argument count; presumably the position at which the new arguments
// are inserted (the use of `index` is on lines not shown) -- TODO confirm.
92 unsigned index = gpuFuncOp.getNumArguments();
// Pointer type in the workgroup address space.
96 Type workgroupPtrType =
97 rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
// Two named attributes are prepared: LLVM `noalias` and the dialect's
// workgroup-attribution marker, both starting out as unit attributes.
102 rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
103 rewriter.getUnitAttr()),
104 rewriter.getNamedAttr(
105 getDialect().getWorkgroupAttributionAttrHelper().getName(),
106 rewriter.getUnitAttr()),
// Per attribution: the last prepared attribute (the workgroup-attribution
// marker) gets its value replaced by a WorkgroupAttributionAttr holding the
// memref's element count and an LLVM element type, then the attribute set is
// recorded as a dictionary in `argAttrs`.
110 auto attributionType = cast<MemRefType>(attribution.getType());
111 IntegerAttr numElements =
112 rewriter.getI64IntegerAttr(attributionType.getNumElements());
113 Type llvmElementType =
115 if (!llvmElementType)
117 TypeAttr type = TypeAttr::get(llvmElementType);
118 attrs.back().setValue(
119 rewriter.getAttr<LLVM::WorkgroupAttributionAttr>(numElements, type));
120 argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
// The argument insertion is performed inside modifyOpInPlace, i.e. as an
// in-place modification of `gpuFuncOp` made through the rewriter.
127 rewriter.modifyOpInPlace(
128 gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
130 static_cast<FunctionOpInterface
>(gpuFuncOp).insertArguments(
131 argIndices, argTypes, argAttrs, argLocs);
134 "expected GPU funcs to support inserting any argument");
// Mode 2 (presumably the else-branch; the branch keyword is not visible):
// one LLVM global is created per workgroup attribution.
137 workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
138 for (
auto [idx, attribution] :
139 llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
// Attributions are required (by assertion) to be statically shaped memrefs.
140 auto type = dyn_cast<MemRefType>(attribution.getType());
141 assert(type && type.hasStaticShape() &&
"unexpected type in attribution");
143 uint64_t numElements = type.getNumElements();
// The global is an LLVM array of `numElements` converted elements.
146 cast<Type>(typeConverter->convertType(type.getElementType()));
147 auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
// Symbol name has the form "__wg_<function name>_<attribution index>".
149 std::string(llvm::formatv(
"__wg_{0}_{1}", gpuFuncOp.getName(), idx));
// Alignment stays 0 unless the attribution carries an integer-valued
// attribute under the LLVM dialect's align attribute name.
150 uint64_t alignment = 0;
151 if (
auto alignAttr = dyn_cast_or_null<IntegerAttr>(
152 gpuFuncOp.getWorkgroupAttributionAttr(
153 idx, LLVM::LLVMDialect::getAlignAttrName())))
154 alignment = alignAttr.getInt();
// Internal-linkage global with an empty (null) Attribute as its value; the
// trailing arguments of this call are not visible here.
155 auto globalOp = LLVM::GlobalOp::create(
156 rewriter, gpuFuncOp.getLoc(), arrayType,
false,
157 LLVM::Linkage::Internal, name,
Attribute(), alignment,
159 workgroupBuffers.push_back(globalOp);
// Signature conversion sized by the number of entry-block arguments of the
// GPU function.
164 TypeConverter::SignatureConversion signatureConversion(
165 gpuFuncOp.front().getNumArguments());
168 gpuFuncOp.getFunctionType(),
false,
// A match failure is reported with the function type in the diagnostic; the
// guarding condition is on a line not shown.
171 return rewriter.notifyMatchFailure(gpuFuncOp, [&](
Diagnostic &
diag) {
172 diag <<
"failed to convert function signature type for: "
173 << gpuFuncOp.getFunctionType();
// Walk the attributes of the GPU function. Attributes whose names appear in
// the condition below get special treatment (the action itself is on a line
// not shown -- presumably they are skipped); the argument-attribute array is
// captured into `argAttrs`; attributes reaching the end of the loop body are
// appended to `attributes`.
181 for (
const auto &attr : gpuFuncOp->getAttrs()) {
183 attr.getName() == gpuFuncOp.getFunctionTypeAttrName() ||
185 gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
186 attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
187 attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
188 attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
189 attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
191 if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
192 argAttrs = gpuFuncOp.getArgAttrsAttr();
195 attributes.push_back(attr);
// Known block/grid sizes are re-attached under the names provided by the GPU
// dialect's attribute helpers (the values passed are on lines not shown).
202 auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
204 attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
207 attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
// Kernels additionally receive the configured kernel marker attribute (if a
// name is set) and, when both the name and a known block size exist, the
// configured block-size attribute.
213 if (gpuFuncOp.isKernel()) {
214 if (kernelAttributeName)
215 attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
217 if (kernelBlockSizeAttributeName && knownBlockSize) {
218 attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
// Calling convention depends on whether the function is a kernel.
221 LLVM::CConv callingConvention = gpuFuncOp.isKernel()
222 ? kernelCallingConvention
223 : nonKernelCallingConvention;
// Create the replacement LLVM function: same name and location, external
// linkage, the collected attributes.
224 auto llvmFuncOp = LLVM::LLVMFuncOp::create(
225 rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
226 LLVM::Linkage::External,
false, callingConvention,
227 nullptr, attributes);
// Subsequent ops are emitted at the start of the GPU function's entry block.
238 rewriter.setInsertionPointToStart(&gpuFuncOp.front());
239 unsigned numProperArguments = gpuFuncOp.getNumArguments();
// Mode 1: the attribution pointers are expected to already be the trailing
// `numAttributions` function arguments (enforced by the assertion).
241 if (encodeWorkgroupAttributionsAsArguments) {
244 unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
245 assert(numProperArguments >= numAttributions &&
246 "Expecting attributions to be encoded as arguments already");
251 gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
// Pair each workgroup attribution with its pointer argument; zip_equal
// requires both ranges to have the same length.
253 for (
auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
254 gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
255 auto [attribution, arg] = vals;
256 auto type = cast<MemRefType>(attribution.getType());
// Uses of the attribution block argument are redirected to `descr`
// (constructed on lines not shown).
264 signatureConversion.remapInput(numProperArguments + idx, descr);
// Mode 2: for each global created above, take its address (pointer in the
// global's address space), apply a GEP typed by the global's type (indices
// not visible), and remap the matching attribution input to `descr`.
267 for (
const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
268 auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
269 global.getAddrSpace());
270 Value address = LLVM::AddressOfOp::create(rewriter, loc, ptrType,
271 global.getSymNameAttr());
273 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getType(),
280 Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
281 auto type = cast<MemRefType>(attribution.
getType());
284 signatureConversion.remapInput(numProperArguments + idx, descr);
// Private attributions: each one becomes an alloca of `numElements` elements
// of the converted element type, with a pointer in `allocaAddrSpace`.
289 unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
290 auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
291 for (
const auto [idx, attribution] :
292 llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
// NOTE(review): `cast<>` does not yield null on a type mismatch, so the
// `type &&` part of the assertion below looks redundant -- confirm.
293 auto type = cast<MemRefType>(attribution.getType());
294 assert(type && type.hasStaticShape() &&
"unexpected type in attribution");
299 Type elementType = typeConverter->convertType(type.getElementType());
301 LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
// Element count is materialized as an i64 constant.
302 Value numElements = LLVM::ConstantOp::create(
303 rewriter, gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
// Same alignment lookup as for workgroup attributions: 0 unless an integer
// align attribute is attached to this private attribution.
304 uint64_t alignment = 0;
306 dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getPrivateAttributionAttr(
307 idx, LLVM::LLVMDialect::getAlignAttrName())))
308 alignment = alignAttr.getInt();
310 LLVM::AllocaOp::create(rewriter, gpuFuncOp.getLoc(), ptrType,
311 elementType, numElements, alignment);
// Private attribution inputs are indexed after the proper arguments and the
// workgroup attributions.
314 signatureConversion.remapInput(
315 numProperArguments + numWorkgroupAttributions + idx, descr);
// Move the GPU function body into the LLVM function, then convert the region's
// types using the type converter and the signature conversion built above.
320 rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
322 if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
323 &signatureConversion)))
// Propagate selected argument attributes from the GPU function to the LLVM
// function. `remapping` gives, for original argument `idx`, the first
// converted argument (`inputNo`) and how many converted arguments it expanded
// to (`size`).
328 for (
const auto [idx, argTy] :
329 llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
330 auto remapping = signatureConversion.getInputMapping(idx);
// Falls back to an empty attribute list when the function has no arg attrs.
332 argAttrs ? cast<DictionaryAttr>(argAttrs[idx]) :
NamedAttrList();
// Helper: set `attrName` on every converted argument of this input (how
// `attr` is looked up is on lines not shown).
333 auto copyAttribute = [&](StringRef attrName) {
337 for (
size_t i = 0, e = remapping->size; i < e; ++i)
338 llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
// Helper: like copyAttribute, but only pointer-typed converted arguments
// receive the attribute. `noalias` on an input that expanded to more than one
// value is singled out with the message below (the action taken is on lines
// not shown -- presumably the attribute is not copied).
340 auto copyPointerAttribute = [&](StringRef attrName) {
345 if (remapping->size > 1 &&
346 attrName == LLVM::LLVMDialect::getNoAliasAttrName()) {
348 "Cannot copy noalias with non-bare pointers.\n");
351 for (
size_t i = 0, e = remapping->size; i < e; ++i) {
352 if (isa<LLVM::LLVMPointerType>(
353 llvmFuncOp.getArgument(remapping->inputNo + i).getType())) {
354 llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
// Attributes copied regardless of the converted argument type.
362 copyAttribute(LLVM::LLVMDialect::getReturnedAttrName());
363 copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName());
364 copyAttribute(LLVM::LLVMDialect::getInRegAttrName());
// True if at least one converted argument of this input is an LLVM pointer.
365 bool lowersToPointer =
false;
366 for (
size_t i = 0, e = remapping->size; i < e; ++i) {
367 lowersToPointer |= isa<LLVM::LLVMPointerType>(
368 llvmFuncOp.getArgument(remapping->inputNo + i).getType());
// Pointer-only attributes are considered only when a pointer is present.
371 if (lowersToPointer) {
372 copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName());
373 copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName());
374 copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName());
375 copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName());
376 copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName());
377 copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName());
378 copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName());
379 copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName());
380 copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
381 copyPointerAttribute(
382 LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
383 copyPointerAttribute(
384 LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr());
// The GPU function has been replaced by `llvmFuncOp`; erase the original op.
387 rewriter.eraseOp(gpuFuncOp);
392 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
393 ConversionPatternRewriter &rewriter)
const {
// Body of a pattern that rewrites gpu.printf into a chain of calls to
// `__ockl_printf_*` functions. A 64-bit descriptor value is returned by each
// call and passed to the next one.
// NOTE(review): the leading number on each line is the original file's line
// number; lines between consecutive numbers are not shown in this view.
394 Location loc = gpuPrintfOp->getLoc();
// LLVM types used for the runtime function signatures and constants.
396 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
397 auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
398 mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
399 mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
// Match failure reported when no parent module is available (the check itself
// is on lines not shown).
403 return rewriter.notifyMatchFailure(gpuPrintfOp,
404 "Couldn't find a parent module");
// Function type i64(i64); presumably the signature of `ocklBegin`, whose
// declaration is not visible -- TODO confirm.
408 LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
// The append-args function is only set up when the printf has arguments.
// Signature: i64(i64, i32, i64 x 7, i32).
409 LLVM::LLVMFuncOp ocklAppendArgs;
410 if (!adaptor.getArgs().empty()) {
412 moduleOp, loc, rewriter,
"__ockl_printf_append_args",
413 LLVM::LLVMFunctionType::get(
414 llvmI64, {llvmI64, llvmI32, llvmI64, llvmI64, llvmI64,
415 llvmI64, llvmI64, llvmI64, llvmI64, llvmI32}));
// String-append function; parameters are (i64, ptr, i64, i32). The return
// type is on a line not shown.
418 moduleOp, loc, rewriter,
"__ockl_printf_append_string_n",
419 LLVM::LLVMFunctionType::get(
421 {llvmI64, ptrType, llvmI64, llvmI32}));
// Start the printf: call `ocklBegin` with constant 0 and keep its result as
// the running descriptor.
424 Value zeroI64 = LLVM::ConstantOp::create(rewriter, loc, llvmI64, 0);
425 auto printfBeginCall =
426 LLVM::CallOp::create(rewriter, loc, ocklBegin, zeroI64);
427 Value printfDesc = printfBeginCall.getResult();
// A global holding the format string is obtained with symbol prefix
// "printfFormat_"; its address is taken in the global's address space and
// passed through a GEP producing `ptrType` (indices not visible).
431 rewriter, loc, moduleOp, llvmI8,
"printfFormat_", adaptor.getFormat());
434 Value globalPtr = LLVM::AddressOfOp::create(
436 LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
437 global.getSymNameAttr());
439 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
// Length passed to the runtime is the size of the global's string value.
441 Value stringLen = LLVM::ConstantOp::create(
442 rewriter, loc, llvmI64, cast<StringAttr>(global.getValueAttr()).size());
444 Value oneI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 1);
445 Value zeroI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 0);
// Append the format string. The final i32 operand is 1 when there are no
// arguments to follow and 0 otherwise.
447 auto appendFormatCall = LLVM::CallOp::create(
448 rewriter, loc, ocklAppendStringN,
449 ValueRange{printfDesc, stringStart, stringLen,
450 adaptor.getArgs().empty() ? oneI32 : zeroI32});
451 printfDesc = appendFormatCall.getResult();
// Arguments are appended in groups of at most `argsPerAppend` (7) per call.
454 constexpr size_t argsPerAppend = 7;
455 size_t nArgs = adaptor.getArgs().size();
456 for (
size_t group = 0; group < nArgs; group += argsPerAppend) {
457 size_t bound = std::min(group + argsPerAppend, nArgs);
458 size_t numArgsThisCall = bound - group;
// Call operands: descriptor, then the number of real arguments in this group.
461 arguments.push_back(printfDesc);
463 LLVM::ConstantOp::create(rewriter, loc, llvmI32, numArgsThisCall));
464 for (
size_t i = group; i < bound; ++i) {
465 Value arg = adaptor.getArgs()[i];
// Floating-point values that are not f64 are extended to f64, and a bitcast
// to i64 follows (presumably for every float; the closing of the inner `if`
// is not visible -- TODO confirm).
466 if (
auto floatType = dyn_cast<FloatType>(arg.
getType())) {
467 if (!floatType.isF64())
468 arg = LLVM::FPExtOp::create(
469 rewriter, loc, typeConverter->convertType(rewriter.getF64Type()),
471 arg = LLVM::BitcastOp::create(rewriter, loc, llvmI64, arg);
// Zero-extension to i64; the condition guarding it is on a line not shown.
474 arg = LLVM::ZExtOp::create(rewriter, loc, llvmI64, arg);
476 arguments.push_back(arg);
// Pad the group with zero i64 values up to `argsPerAppend` operands.
479 for (
size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
480 arguments.push_back(zeroI64);
// Trailing i32 flag: 1 for the group that reaches the end of the arguments,
// 0 otherwise.
483 auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
484 arguments.push_back(isLast);
485 auto call = LLVM::CallOp::create(rewriter, loc, ocklAppendArgs, arguments);
486 printfDesc = call.getResult();
// All calls have been emitted; remove the original gpu.printf op.
488 rewriter.eraseOp(gpuPrintfOp);
541 gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
542 ConversionPatternRewriter &rewriter)
const {
// Body of a pattern that rewrites gpu.printf into a single call to `vprintf`,
// passing the format string pointer and a pointer to a stack-allocated struct
// that holds the arguments.
// NOTE(review): the leading number on each line is the original file's line
// number; lines between consecutive numbers are not shown in this view.
543 Location loc = gpuPrintfOp->getLoc();
545 mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
546 mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
// Match failure reported when no parent module is available (the check itself
// is on lines not shown).
550 return rewriter.notifyMatchFailure(gpuPrintfOp,
551 "Couldn't find a parent module");
// `vprintf` is looked up or declared with type i32(ptr, ptr).
559 LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType, ptrType});
561 moduleOp, globalLoc, rewriter,
"vprintf", vprintfType);
// Global holding the format string, with symbol prefix "printfFormat_".
564 LLVM::GlobalOp global =
566 "printfFormat_", adaptor.getFormat());
// Address of the global, followed by a GEP producing `ptrType` (indices not
// visible).
569 Value globalPtr = LLVM::AddressOfOp::create(rewriter, loc, global);
571 LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
// Collect the (possibly promoted) arguments and their types.
576 for (
Value arg : adaptor.getArgs()) {
577 Type type = arg.getType();
578 Value promotedArg = arg;
// Floating-point arguments are promoted to f64 through FPExt.
// NOTE(review): no check for an argument that is already f64 is visible here,
// whereas the `__ockl_printf` lowering earlier in this file guards its FPExt
// with `!floatType.isF64()` -- confirm whether an f64 -> f64 FPExt can be
// created on this path.
580 if (isa<FloatType>(type)) {
581 type = rewriter.getF64Type();
582 promotedArg = LLVM::FPExtOp::create(rewriter, loc, type, arg);
584 types.push_back(type);
585 args.push_back(promotedArg);
// Literal LLVM struct whose fields are the promoted argument types.
588 LLVM::LLVMStructType::getLiteral(gpuPrintfOp.getContext(), types);
// Allocate one instance of the struct (count is the i64 constant 1).
589 Value one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(),
590 rewriter.getIndexAttr(1));
592 LLVM::AllocaOp::create(rewriter, loc, ptrType, structType, one,
// Store each argument through a pointer computed from `tempAlloc` (the op
// being created and its indices are on lines not shown -- presumably a GEP
// selecting field `index`).
594 for (
auto [
index, arg] : llvm::enumerate(args)) {
596 rewriter, loc, ptrType, structType, tempAlloc,
598 LLVM::StoreOp::create(rewriter, loc, arg,
ptr);
// Call vprintf(format string start, argument buffer), then remove the
// original gpu.printf op.
600 std::array<Value, 2> printfArgs = {stringStart, tempAlloc};
602 LLVM::CallOp::create(rewriter, loc, vprintfDecl, printfArgs);
603 rewriter.eraseOp(gpuPrintfOp);