#include "../LLVMCommon/MemRefDescriptor.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Casting.h"

#define GEN_PASS_DEF_CONVERTAMDGPUTOROCDLPASS
#include "mlir/Conversion/Passes.h.inc"
/// Convert an unsigned number `val` to i32.
static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
                                  Location loc, Value val) {
  IntegerType i32 = rewriter.getI32Type();
  auto valTy = cast<IntegerType>(val.getType());
  if (valTy == i32)
    return val;
  return valTy.getWidth() > 32
             ? Value(rewriter.create<LLVM::TruncOp>(loc, i32, val))
             : Value(rewriter.create<LLVM::ZExtOp>(loc, i32, val));
}
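// Illustrative note (not from the original source): an i64 index is truncated
// to i32 here, while an i16 value is zero-extended, since the ROCDL buffer
// intrinsics used below consume 32-bit offsets.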
static Value createI32Constant(ConversionPatternRewriter &rewriter,
                               Location loc, int32_t value) {
  Type i32 = rewriter.getI32Type();
  return rewriter.create<LLVM::ConstantOp>(loc, i32, value);
}
static Value createI1Constant(ConversionPatternRewriter &rewriter,
                              Location loc, bool value) {
  Type llvmI1 = rewriter.getI1Type();
  return rewriter.create<LLVM::ConstantOp>(loc, llvmI1, value);
}
/// Returns the linear index used to access an element in the memref.
static Value getLinearIndexI32(ConversionPatternRewriter &rewriter,
                               Location loc, MemRefDescriptor &memRefDescriptor,
                               ValueRange indices, ArrayRef<int64_t> strides) {
  IntegerType i32 = rewriter.getI32Type();
  Value index;
  for (auto [i, increment, stride] : llvm::enumerate(indices, strides)) {
    if (stride != 1) { // Skip if stride is 1.
      Value strideValue =
          ShapedType::isDynamic(stride)
              ? convertUnsignedToI32(rewriter, loc,
                                     memRefDescriptor.stride(rewriter, loc, i))
              : rewriter.create<LLVM::ConstantOp>(loc, i32, stride);
      increment = rewriter.create<LLVM::MulOp>(loc, increment, strideValue);
    }
    index =
        index ? rewriter.create<LLVM::AddOp>(loc, index, increment) : increment;
  }
  return index ? index : createI32Constant(rewriter, loc, 0);
}
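// Illustrative example: for indices (%i, %j) into a memref with static
// strides [4, 1], this emits `%i * 4 + %j` (the stride-1 multiply is
// skipped), all in i32 as the raw buffer intrinsics require.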
/// Compute the contents of the num_records field for a given memref
/// descriptor - that is, the number of bytes that is one element past the
/// greatest possible valid index into the memref.
static Value getNumRecords(ConversionPatternRewriter &rewriter, Location loc,
                           MemRefType memrefType,
                           MemRefDescriptor &memrefDescriptor,
                           ArrayRef<int64_t> strides,
                           uint32_t elementByteWidth) {
  if (memrefType.hasStaticShape() &&
      !llvm::any_of(strides, ShapedType::isDynamic)) {
    int64_t size = memrefType.getRank() == 0 ? 1 : 0;
    ArrayRef<int64_t> shape = memrefType.getShape();
    for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i)
      size = std::max(shape[i] * strides[i], size);
    size = size * elementByteWidth;
    assert(size < std::numeric_limits<uint32_t>::max() &&
           "the memref buffer is too large");
    return createI32Constant(rewriter, loc, static_cast<int32_t>(size));
  }
  Value maxIndex;
  for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) {
    Value size = memrefDescriptor.size(rewriter, loc, i);
    Value stride = memrefDescriptor.stride(rewriter, loc, i);
    Value maxThisDim = rewriter.create<LLVM::MulOp>(loc, size, stride);
    maxIndex = maxIndex
                   ? rewriter.create<LLVM::UMaxOp>(loc, maxIndex, maxThisDim)
                   : maxThisDim;
  }
  Value maxIndexI32 = convertUnsignedToI32(rewriter, loc, maxIndex);
  Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth);
  return rewriter.create<LLVM::MulOp>(loc, maxIndexI32, byteWidthConst);
}
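// Worked example (illustrative): for a row-major memref<16x4xf32>
// (strides [4, 1], elementByteWidth == 4), the static path computes
// max(16 * 4, 4 * 1) = 64 elements, i.e. num_records = 256 bytes.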
static Value makeBufferRsrc(ConversionPatternRewriter &rewriter, Location loc,
                            Value basePointer, Value numRecords,
                            bool boundsCheck, amdgpu::Chipset chipset,
                            Value cacheSwizzleStride = nullptr,
                            unsigned addressSpace = 8) {
  // The stride value is generally 0, but on gfx942 and onward a cache
  // swizzling mode can be requested by setting bit 14 of the stride field
  // and storing a cache-line-sized stride.
  Type i16 = rewriter.getI16Type();
  Value stride;
  if (chipset.majorVersion == 9 && chipset >= kGfx942 && cacheSwizzleStride) {
    Value cacheStrideZext =
        rewriter.create<LLVM::ZExtOp>(loc, i16, cacheSwizzleStride);
    Value swizzleBit = rewriter.create<LLVM::ConstantOp>(
        loc, i16, rewriter.getI16IntegerAttr(1 << 14));
    stride = rewriter.create<LLVM::OrOp>(loc, cacheStrideZext, swizzleBit,
                                         /*isDisjoint=*/true);
  } else {
    stride = rewriter.create<LLVM::ConstantOp>(loc, i16,
                                               rewriter.getI16IntegerAttr(0));
  }
  // Flag word: bits 12-14 hold the data format (must be nonzero; 7 = float)
  // and bits 15-18 the numeric format (4 = 32-bit).
  uint32_t flags = (7 << 12) | (4 << 15);
  if (chipset.majorVersion >= 10) {
    // Bits 28-29 are the RDNA out-of-bounds select: 3 checks the offset
    // against num_records, 2 disables bounds checking.
    uint32_t oob = boundsCheck ? 3 : 2;
    flags |= (oob << 28);
  }
  Value flagsConst = createI32Constant(rewriter, loc, flags);
  Type rsrcType =
      LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);
  return rewriter.createOrFold<ROCDL::MakeBufferRsrcOp>(
      loc, rsrcType, basePointer, stride, numRecords, flagsConst);
}
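// Illustrative bit arithmetic: the base flag word is
// (7 << 12) | (4 << 15) == 0x27000; on gfx10+ with boundsCheck == true the
// out-of-bounds field adds (3u << 28), yielding 0x30027000 (ignoring any
// bits set by code elided above).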
struct FatRawBufferCastLowering
    : public ConvertOpToLLVMPattern<FatRawBufferCastOp> {
  // ... (constructor capturing the target chipset elided) ...
  Chipset chipset;

  LogicalResult
  matchAndRewrite(FatRawBufferCastOp op, FatRawBufferCastOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Value memRef = adaptor.getSource();
    Value unconvertedMemref = op.getSource();
    MemRefType memrefType = cast<MemRefType>(unconvertedMemref.getType());
    MemRefDescriptor descriptor(memRef);

    DataLayout dataLayout = DataLayout::closest(op);
    int64_t elementByteWidth =
        dataLayout.getTypeSizeInBits(memrefType.getElementType()) / 8;

    int64_t unusedOffset = 0;
    SmallVector<int64_t, 5> strideVals;
    if (failed(memrefType.getStridesAndOffset(strideVals, unusedOffset)))
      return op.emitOpError("Can't lower non-stride-offset memrefs");

    Value numRecords = adaptor.getValidBytes();
    if (!numRecords)
      numRecords = getNumRecords(rewriter, loc, memrefType, descriptor,
                                 strideVals, elementByteWidth);

    Value basePointer =
        adaptor.getResetOffset()
            ? descriptor.bufferPtr(rewriter, loc, *getTypeConverter(),
                                   memrefType)
            : descriptor.alignedPtr(rewriter, loc);

    Value offset = adaptor.getResetOffset()
                       ? rewriter.create<LLVM::ConstantOp>(
                             loc, getIndexType(), rewriter.getIndexAttr(0))
                       : descriptor.offset(rewriter, loc);

    bool hasSizes = memrefType.getRank() > 0;
    // No need to unpack() and pack() all the individual sizes and strides;
    // extract the size and stride arrays wholesale.
    Value sizes = hasSizes ? rewriter.create<LLVM::ExtractValueOp>(
                                 loc, memRef, kSizePosInMemRefDescriptor)
                           : Value{};
    Value strides = hasSizes
                        ? rewriter.create<LLVM::ExtractValueOp>(
                              loc, memRef, kStridePosInMemRefDescriptor)
                        : Value{};

    Value fatPtr = makeBufferRsrc(
        rewriter, loc, basePointer, numRecords, adaptor.getBoundsCheck(),
        chipset, adaptor.getCacheSwizzleStride(), /*addressSpace=*/7);

    Value result = MemRefDescriptor::poison(
        rewriter, loc,
        getTypeConverter()->convertType(op.getResult().getType()));
    result = rewriter.create<LLVM::InsertValueOp>(
        loc, result, fatPtr, kAllocatedPtrPosInMemRefDescriptor);
    result = rewriter.create<LLVM::InsertValueOp>(
        loc, result, fatPtr, kAlignedPtrPosInMemRefDescriptor);
    result = rewriter.create<LLVM::InsertValueOp>(loc, result, offset,
                                                  kOffsetPosInMemRefDescriptor);
    if (hasSizes) {
      result = rewriter.create<LLVM::InsertValueOp>(loc, result, sizes,
                                                    kSizePosInMemRefDescriptor);
      result = rewriter.create<LLVM::InsertValueOp>(
          loc, result, strides, kStridePosInMemRefDescriptor);
    }
    rewriter.replaceOp(op, result);
    return success();
  }
};
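// Sketch of the rewrite this pattern performs (MLIR syntax approximate, for
// illustration only):
//   %view = amdgpu.fat_raw_buffer_cast %buf
//       : memref<16x4xf32> to memref<16x4xf32, #amdgpu.address_space<fat_raw_buffer>>
// becomes a rocdl.make.buffer.rsrc on the base pointer, re-wrapped in a
// memref descriptor whose pointers live in LLVM address space 7.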
/// Define lowering patterns for raw buffer ops.
template <typename GpuOp, typename Intrinsic>
struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
  // ... (constructor capturing the target chipset elided) ...
  Chipset chipset;
  static constexpr uint32_t maxVectorOpWidth = 128;

  LogicalResult
  matchAndRewrite(GpuOp gpuOp, typename GpuOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = gpuOp.getLoc();
    Value memref = adaptor.getMemref();
    Value unconvertedMemref = gpuOp.getMemref();
    MemRefType memrefType = cast<MemRefType>(unconvertedMemref.getType());

    if (chipset.majorVersion < 9)
      return gpuOp.emitOpError("raw buffer ops require GCN or higher");

    Value storeData = adaptor.getODSOperands(0)[0];
    if (storeData == memref) // no write component to this op
      storeData = Value();
    Type wantedDataType;
    if (storeData)
      wantedDataType = storeData.getType();
    else
      wantedDataType = gpuOp.getODSResults(0)[0].getType();

    Value atomicCmpData = Value();
    // Operand index 1 of a load is the indices; trying to read them can crash.
    if (storeData) {
      Value maybeCmpData = adaptor.getODSOperands(1)[0];
      if (maybeCmpData != memref)
        atomicCmpData = maybeCmpData;
    }

    Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType);

    Type i32 = rewriter.getI32Type();

    // Get the type size in bytes.
    DataLayout dataLayout = DataLayout::closest(gpuOp);
    int64_t elementByteWidth =
        dataLayout.getTypeSizeInBits(memrefType.getElementType()) / 8;
    Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth);

    // If we want to load a vector<NxT> with total size <= 32 bits, use a
    // scalar load and bitcast it. Similarly, if bitsize(T) < 32 and the total
    // load size is >= 32, use a vector load of N / (bitsize(T) / 32) x i32 and
    // bitcast it. Also, the CAS intrinsic requires integer operands, so
    // bitcast any floats to integers.
    Type llvmBufferValType = llvmWantedDataType;
    if (atomicCmpData) {
      if (auto floatType = dyn_cast<FloatType>(wantedDataType))
        llvmBufferValType = this->getTypeConverter()->convertType(
            rewriter.getIntegerType(floatType.getWidth()));
    }
    if (auto dataVector = dyn_cast<VectorType>(wantedDataType)) {
      uint32_t vecLen = dataVector.getNumElements();
      uint32_t elemBits =
          dataLayout.getTypeSizeInBits(dataVector.getElementType());
      uint32_t totalBits = elemBits * vecLen;
      bool usePackedFp16 =
          isa_and_present<RawBufferAtomicFaddOp>(*gpuOp) && vecLen == 2;
      if (totalBits > maxVectorOpWidth)
        return gpuOp.emitOpError(
            "Total width of loads or stores must be no more than " +
            Twine(maxVectorOpWidth) + " bits, but we call for " +
            Twine(totalBits) +
            " bits. This should've been caught in validation");
      if (!usePackedFp16 && elemBits < 32) {
        if (totalBits > 32) {
          if (totalBits % 32 != 0)
            return gpuOp.emitOpError("Load or store of more than 32-bits that "
                                     "doesn't fit into words. Can't happen\n");
          llvmBufferValType = this->typeConverter->convertType(
              VectorType::get(totalBits / 32, i32));
        } else {
          llvmBufferValType = this->typeConverter->convertType(
              rewriter.getIntegerType(totalBits));
        }
      }
    }
    if (auto vecType = dyn_cast<VectorType>(llvmBufferValType)) {
      // Buffer intrinsics don't support 1-element vectors; cast them to
      // scalars.
      if (vecType.getNumElements() == 1)
        llvmBufferValType = vecType.getElementType();
    }

    SmallVector<Value, 6> args;
    if (storeData) {
      if (llvmBufferValType != llvmWantedDataType) {
        Value castForStore =
            rewriter.create<LLVM::BitcastOp>(loc, llvmBufferValType, storeData);
        args.push_back(castForStore);
      } else {
        args.push_back(storeData);
      }
    }
    if (atomicCmpData) {
      if (llvmBufferValType != llvmWantedDataType) {
        Value castForCmp = rewriter.create<LLVM::BitcastOp>(
            loc, llvmBufferValType, atomicCmpData);
        args.push_back(castForCmp);
      } else {
        args.push_back(atomicCmpData);
      }
    }

    // Construct the buffer descriptor from the memref and attributes.
    int64_t offset = 0;
    SmallVector<int64_t, 5> strides;
    if (failed(memrefType.getStridesAndOffset(strides, offset)))
      return gpuOp.emitOpError("Can't lower non-stride-offset memrefs");

    MemRefDescriptor memrefDescriptor(memref);

    Value ptr = memrefDescriptor.bufferPtr(
        rewriter, loc, *this->getTypeConverter(), memrefType);
    Value numRecords = getNumRecords(
        rewriter, loc, memrefType, memrefDescriptor, strides, elementByteWidth);
    Value resource = makeBufferRsrc(rewriter, loc, ptr, numRecords,
                                    adaptor.getBoundsCheck(), chipset);
    args.push_back(resource);

    // Indexing (voffset).
    Value voffset = getLinearIndexI32(rewriter, loc, memrefDescriptor,
                                      adaptor.getIndices(), strides);
    if (std::optional<int32_t> indexOffset = adaptor.getIndexOffset();
        indexOffset && *indexOffset > 0) {
      Value extraOffsetConst = createI32Constant(rewriter, loc, *indexOffset);
      voffset =
          voffset ? rewriter.create<LLVM::AddOp>(loc, voffset, extraOffsetConst)
                  : extraOffsetConst;
    }
    voffset = rewriter.create<LLVM::MulOp>(loc, voffset, byteWidthConst);
    args.push_back(voffset);

    // SGPR offset.
    Value sgprOffset = adaptor.getSgprOffset();
    if (!sgprOffset)
      sgprOffset = createI32Constant(rewriter, loc, 0);
    sgprOffset = rewriter.create<LLVM::MulOp>(loc, sgprOffset, byteWidthConst);
    args.push_back(sgprOffset);

    // ... (cache-policy / aux arguments elided) ...

    llvm::SmallVector<Type, 1> resultTypes(gpuOp->getNumResults(),
                                           llvmBufferValType);
    Operation *lowered = rewriter.create<Intrinsic>(loc, resultTypes, args,
                                                    ArrayRef<NamedAttribute>());
    if (lowered->getNumResults() == 1) {
      Value replacement = lowered->getResult(0);
      if (llvmBufferValType != llvmWantedDataType) {
        replacement = rewriter.create<LLVM::BitcastOp>(loc, llvmWantedDataType,
                                                       replacement);
      }
      rewriter.replaceOp(gpuOp, replacement);
    } else {
      rewriter.eraseOp(gpuOp);
    }
    return success();
  }
};
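// Illustrative offset computation: a raw buffer load at indices [%i, %j]
// over static strides [4, 1] with f32 elements and indexOffset = 2 yields
//   voffset = ((%i * 4 + %j) + 2) * 4
// bytes, and any sgprOffset is likewise scaled by the element byte width.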
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
  // ... (constructor capturing the target chipset elided) ...
  Chipset chipset;

  LogicalResult
  matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11;

    if (requiresInlineAsm) {
      auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
                                                      LLVM::AsmDialect::AD_ATT);
      const char *asmStr =
          ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
      const char *constraints = "";
      rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
          op,
          /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
          asmStr, constraints, /*has_side_effects=*/true,
          /*is_align_stack=*/false, asmDialectAttr,
          /*operand_attrs=*/ArrayAttr());
      return success();
    }
    if (chipset.majorVersion < 12) {
      constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
      constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
      // Left in place in case someone disables the inline ASM path or future
      // chipsets use the same bit pattern.
      constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);

      int32_t ldsOnlyBits;
      if (chipset.majorVersion == 11)
        ldsOnlyBits = ldsOnlyBitsGfx11;
      else if (chipset.majorVersion == 10)
        ldsOnlyBits = ldsOnlyBitsGfx10;
      else if (chipset.majorVersion <= 9)
        ldsOnlyBits = ldsOnlyBitsGfx6789;
      else
        return op.emitOpError(
                   "don't know how to lower this for chipset major version")
               << chipset.majorVersion;

      Location loc = op->getLoc();
      rewriter.create<ROCDL::SWaitcntOp>(loc, ldsOnlyBits);
      rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
    } else {
      Location loc = op->getLoc();
      rewriter.create<ROCDL::WaitDscntOp>(loc, 0);
      rewriter.create<ROCDL::BarrierSignalOp>(loc, -1);
      rewriter.replaceOpWithNewOp<ROCDL::BarrierWaitOp>(op, -1);
    }
    return success();
  }
};
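// Illustrative encoding note: the s_waitcnt immediate holds per-counter
// "wait until counter <= N" fields, and an all-ones field means "don't
// wait". ~(0x1f << 8) == 0xffffe0ff therefore zeroes only the lgkmcnt
// (LDS) field, making the barrier wait on outstanding LDS operations alone.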
  LogicalResult
  matchAndRewrite(SchedBarrierOp op, SchedBarrierOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<ROCDL::SchedBarrier>(op,
                                                     (uint32_t)op.getOpts());
    return success();
  }
/// Converts an MFMA vector operand from the MLIR AMDGPU dialect convention to
/// the ROCDL / LLVM AMDGPU intrinsic convention. Note that the type of `input`
/// has already been LLVM-ified at this point.
static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
                                      Location loc, Value input) {
  Type inputType = input.getType();
  if (auto vectorType = dyn_cast<VectorType>(inputType)) {
    // Bitcast bf16 vectors to i16 vectors; the intrinsics predate backend
    // bfloat support.
    if (vectorType.getElementType().isBF16())
      return rewriter.create<LLVM::BitcastOp>(
          loc, vectorType.clone(rewriter.getI16Type()), input);
    // Pack vectors of at most 8 i8 elements into a single integer.
    if (vectorType.getElementType().isInteger(8) &&
        vectorType.getNumElements() <= 8)
      return rewriter.create<LLVM::BitcastOp>(
          loc, rewriter.getIntegerType(vectorType.getNumElements() * 8), input);
    // Larger sub-byte integer vectors become <N x i32>.
    if (isa<IntegerType>(vectorType.getElementType()) &&
        vectorType.getElementTypeBitWidth() <= 8) {
      int64_t numWords = llvm::divideCeil(
          vectorType.getNumElements() * vectorType.getElementTypeBitWidth(),
          32);
      return rewriter.create<LLVM::BitcastOp>(
          loc, VectorType::get(numWords, rewriter.getI32Type()), input);
    }
  }
  return input;
}
/// Converts the scaled MFMA scale operands from the MLIR AMDGPU dialect
/// convention to the ROCDL / LLVM AMDGPU intrinsic convention: zero-extend
/// integers and bitcast everything else to i32.
static Value castMFMAScaleOperand(ConversionPatternRewriter &rewriter,
                                  Location loc, Value input) {
  Type inputType = input.getType();
  Type outputType = rewriter.getI32Type();
  if (auto intType = dyn_cast<IntegerType>(inputType))
    return rewriter.create<LLVM::ZExtOp>(loc, outputType, input);
  return rewriter.create<LLVM::BitcastOp>(loc, outputType, input);
}
/// Push an input operand. Float inputs pass through unchanged (apart from a
/// bf16 -> i16 bitcast); integer inputs additionally push their signedness
/// (1 for signed, 0 for unsigned) and are packed into i32 words.
static void wmmaPushInputOperand(ConversionPatternRewriter &rewriter,
                                 Location loc,
                                 const TypeConverter *typeConverter,
                                 bool isUnsigned, Value llvmInput,
                                 Value mlirInput,
                                 SmallVector<Value, 4> &operands) {
  Type inputType = llvmInput.getType();
  auto vectorType = dyn_cast<VectorType>(inputType);
  if (!vectorType) {
    operands.push_back(llvmInput);
    return;
  }
  Type elemType = vectorType.getElementType();

  if (elemType.isBF16())
    llvmInput = rewriter.create<LLVM::BitcastOp>(
        loc, vectorType.clone(rewriter.getI16Type()), llvmInput);
  if (elemType.getIntOrFloatBitWidth() > 8) {
    operands.push_back(llvmInput);
    return;
  }

  // Check the type of the input before conversion: in LLVM, fp8 is converted
  // to i8, so the fp8/int8 distinction is lost by this point.
  auto mlirInputType = cast<VectorType>(mlirInput.getType());
  bool isInputInteger = mlirInputType.getElementType().isInteger();
  if (isInputInteger) {
    // If the element type is explicitly signed or unsigned, it overrides the
    // isUnsigned flag.
    bool localIsUnsigned = isUnsigned;
    if (elemType.isUnsignedInteger())
      localIsUnsigned = true;
    else if (elemType.isSignedInteger())
      localIsUnsigned = false;
    Value sign = createI1Constant(rewriter, loc, !localIsUnsigned);
    operands.push_back(sign);
  }

  int64_t numBits =
      vectorType.getNumElements() * elemType.getIntOrFloatBitWidth();
  Type i32 = rewriter.getI32Type();
  Type intrinsicInType = numBits <= 32
                             ? (Type)rewriter.getIntegerType(numBits)
                             : (Type)VectorType::get(numBits / 32, i32);
  auto llvmIntrinsicInType = typeConverter->convertType(intrinsicInType);
  Value castInput = rewriter.createOrFold<LLVM::BitcastOp>(
      loc, llvmIntrinsicInType, llvmInput);
  // Inputs narrower than 32 bits sit in the low bits of an i32.
  if (numBits < 32)
    castInput = rewriter.create<LLVM::ZExtOp>(loc, i32, castInput);
  operands.push_back(castInput);
}
/// Push the output operand. For most cases this only pushes the output.
/// For the f16 -> f16 and bf16 -> bf16 intrinsics, which reuse the same
/// VGPRs for both halves, `subwordOffset` selects whether the result lands
/// in the lower (0) or upper (1) 16 bits of each register.
static void wmmaPushOutputOperand(ConversionPatternRewriter &rewriter,
                                  Location loc,
                                  const TypeConverter *typeConverter,
                                  Value output, int32_t subwordOffset,
                                  bool clamp, SmallVector<Value, 4> &operands) {
  Type inputType = output.getType();
  auto vectorType = dyn_cast<VectorType>(inputType);
  Type elemType = vectorType.getElementType();
  if (elemType.isBF16())
    output = rewriter.create<LLVM::BitcastOp>(
        loc, vectorType.clone(rewriter.getI16Type()), output);
  operands.push_back(output);
  if (elemType.isF16() || elemType.isBF16() || elemType.isInteger(16))
    operands.push_back(createI1Constant(rewriter, loc, subwordOffset));
  else if (elemType.isInteger(32))
    operands.push_back(createI1Constant(rewriter, loc, clamp));
}
/// Return true if `type` is the E5M2 variant of an 8-bit float that is
/// supported by the `_bf8` instructions on the given `chipset`.
static bool typeIsExpectedBf8ForChipset(Chipset chipset, Type type) {
  return (chipset == kGfx942 && isa<Float8E5M2FNUZType>(type)) ||
         (hasOcpFp8(chipset) && isa<Float8E5M2Type>(type));
}

/// Return true if `type` is the E4M3FN variant of an 8-bit float that is
/// supported by the `_fp8` instructions on the given `chipset`.
static bool typeIsExpectedFp8ForChipset(Chipset chipset, Type type) {
  return (chipset == kGfx942 && isa<Float8E4M3FNUZType>(type)) ||
         (hasOcpFp8(chipset) && isa<Float8E4M3FNType>(type));
}
/// Return the `rocdl` intrinsic corresponding to a MFMA operation `mfma`
/// if one exists. This includes checking to ensure the intrinsic is supported
/// on the architecture you are compiling for.
static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
                                                  Chipset chipset) {
  uint32_t m = mfma.getM(), n = mfma.getN(), k = mfma.getK(),
           b = mfma.getBlocks();
  Type sourceElem = getElementTypeOrSelf(mfma.getSourceA().getType());
  Type destElem = getElementTypeOrSelf(mfma.getDestC().getType());

  if (sourceElem.isF32() && destElem.isF32()) {
    if (mfma.getReducePrecision() && chipset >= kGfx942) {
      if (m == 32 && n == 32 && k == 4 && b == 1)
        return ROCDL::mfma_f32_32x32x4_xf32::getOperationName();
      if (m == 16 && n == 16 && k == 8 && b == 1)
        return ROCDL::mfma_f32_16x16x8_xf32::getOperationName();
    }
    if (m == 32 && n == 32 && k == 1 && b == 2)
      return ROCDL::mfma_f32_32x32x1f32::getOperationName();
    if (m == 16 && n == 16 && k == 1 && b == 4)
      return ROCDL::mfma_f32_16x16x1f32::getOperationName();
    if (m == 4 && n == 4 && k == 1 && b == 16)
      return ROCDL::mfma_f32_4x4x1f32::getOperationName();
    if (m == 32 && n == 32 && k == 2 && b == 1)
      return ROCDL::mfma_f32_32x32x2f32::getOperationName();
    if (m == 16 && n == 16 && k == 4 && b == 1)
      return ROCDL::mfma_f32_16x16x4f32::getOperationName();
  }

  if (sourceElem.isF16() && destElem.isF32()) {
    if (chipset >= kGfx950) {
      if (m == 32 && n == 32 && k == 16 && b == 1)
        return ROCDL::mfma_f32_32x32x16_f16::getOperationName();
      if (m == 16 && n == 16 && k == 32 && b == 1)
        return ROCDL::mfma_f32_16x16x32_f16::getOperationName();
    }
    if (m == 32 && n == 32 && k == 4 && b == 2)
      return ROCDL::mfma_f32_32x32x4f16::getOperationName();
    if (m == 16 && n == 16 && k == 4 && b == 4)
      return ROCDL::mfma_f32_16x16x4f16::getOperationName();
    if (m == 4 && n == 4 && k == 4 && b == 16)
      return ROCDL::mfma_f32_4x4x4f16::getOperationName();
    if (m == 32 && n == 32 && k == 8 && b == 1)
      return ROCDL::mfma_f32_32x32x8f16::getOperationName();
    if (m == 16 && n == 16 && k == 16 && b == 1)
      return ROCDL::mfma_f32_16x16x16f16::getOperationName();
  }

  if (sourceElem.isBF16() && destElem.isF32()) {
    if (chipset >= kGfx950) {
      if (m == 32 && n == 32 && k == 16 && b == 1)
        return ROCDL::mfma_f32_32x32x16_bf16::getOperationName();
      if (m == 16 && n == 16 && k == 32 && b == 1)
        return ROCDL::mfma_f32_16x16x32_bf16::getOperationName();
    }
    if (chipset >= kGfx90a) {
      if (m == 32 && n == 32 && k == 4 && b == 2)
        return ROCDL::mfma_f32_32x32x4bf16_1k::getOperationName();
      if (m == 16 && n == 16 && k == 4 && b == 4)
        return ROCDL::mfma_f32_16x16x4bf16_1k::getOperationName();
      if (m == 4 && n == 4 && k == 4 && b == 16)
        return ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName();
      if (m == 32 && n == 32 && k == 8 && b == 1)
        return ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName();
      if (m == 16 && n == 16 && k == 16 && b == 1)
        return ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName();
    }
    if (m == 32 && n == 32 && k == 2 && b == 2)
      return ROCDL::mfma_f32_32x32x2bf16::getOperationName();
    if (m == 16 && n == 16 && k == 2 && b == 4)
      return ROCDL::mfma_f32_16x16x2bf16::getOperationName();
    if (m == 4 && n == 4 && k == 2 && b == 16)
      return ROCDL::mfma_f32_4x4x2bf16::getOperationName();
    if (m == 32 && n == 32 && k == 4 && b == 1)
      return ROCDL::mfma_f32_32x32x4bf16::getOperationName();
    if (m == 16 && n == 16 && k == 8 && b == 1)
      return ROCDL::mfma_f32_16x16x8bf16::getOperationName();
  }

  if (sourceElem.isInteger(8) && destElem.isInteger(32)) {
    if (chipset >= kGfx950) {
      if (m == 32 && n == 32 && k == 32 && b == 1)
        return ROCDL::mfma_i32_32x32x32_i8::getOperationName();
      if (m == 16 && n == 16 && k == 64 && b == 1)
        return ROCDL::mfma_i32_16x16x64_i8::getOperationName();
    }
    if (m == 32 && n == 32 && k == 4 && b == 2)
      return ROCDL::mfma_i32_32x32x4i8::getOperationName();
    if (m == 16 && n == 16 && k == 4 && b == 4)
      return ROCDL::mfma_i32_16x16x4i8::getOperationName();
    if (m == 4 && n == 4 && k == 4 && b == 16)
      return ROCDL::mfma_i32_4x4x4i8::getOperationName();
    if (m == 32 && n == 32 && k == 8 && b == 1)
      return ROCDL::mfma_i32_32x32x8i8::getOperationName();
    if (m == 16 && n == 16 && k == 16 && b == 1)
      return ROCDL::mfma_i32_16x16x16i8::getOperationName();
    if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942)
      return ROCDL::mfma_i32_32x32x16_i8::getOperationName();
    if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942)
      return ROCDL::mfma_i32_16x16x32_i8::getOperationName();
  }

  if (sourceElem.isF64() && destElem.isF64() && chipset >= kGfx90a) {
    if (m == 16 && n == 16 && k == 4 && b == 1)
      return ROCDL::mfma_f64_16x16x4f64::getOperationName();
    if (m == 4 && n == 4 && k == 4 && b == 4)
      return ROCDL::mfma_f64_4x4x4f64::getOperationName();
  }

  if (destElem.isF32() && typeIsExpectedBf8ForChipset(chipset, sourceElem)) {
    // Known to be correct because there are no scalar f8 instructions and
    // because a length mismatch will have been caught by the verifier.
    Type sourceBElem =
        cast<VectorType>(mfma.getSourceB().getType()).getElementType();
    if (m == 16 && n == 16 && k == 32 && b == 1) {
      if (typeIsExpectedBf8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_16x16x32_bf8_bf8::getOperationName();
      if (typeIsExpectedFp8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_16x16x32_bf8_fp8::getOperationName();
    }
    if (m == 32 && n == 32 && k == 16 && b == 1) {
      if (typeIsExpectedBf8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_32x32x16_bf8_bf8::getOperationName();
      if (typeIsExpectedFp8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_32x32x16_bf8_fp8::getOperationName();
    }
  }

  if (destElem.isF32() && typeIsExpectedFp8ForChipset(chipset, sourceElem)) {
    Type sourceBElem =
        cast<VectorType>(mfma.getSourceB().getType()).getElementType();
    if (m == 16 && n == 16 && k == 32 && b == 1) {
      if (typeIsExpectedBf8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_16x16x32_fp8_bf8::getOperationName();
      if (typeIsExpectedFp8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_16x16x32_fp8_fp8::getOperationName();
    }
    if (m == 32 && n == 32 && k == 16 && b == 1) {
      if (typeIsExpectedBf8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_32x32x16_fp8_bf8::getOperationName();
      if (typeIsExpectedFp8ForChipset(chipset, sourceBElem))
        return ROCDL::mfma_f32_32x32x16_fp8_fp8::getOperationName();
    }
  }

  return std::nullopt;
}
static std::optional<uint32_t> mfmaTypeSelectCode(Type mlirElemType) {
  return llvm::TypeSwitch<Type, std::optional<uint32_t>>(mlirElemType)
      .Case([](Float8E4M3FNType) { return 0u; })
      .Case([](Float8E5M2Type) { return 1u; })
      .Case([](Float6E2M3FNType) { return 2u; })
      .Case([](Float6E3M2FNType) { return 3u; })
      .Case([](Float4E2M1FNType) { return 4u; })
      .Default([](Type) { return std::nullopt; });
}
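// Illustrative mapping: an f8E4M3FN operand selects type code 0 and an
// f4E2M1FN operand selects 4; these codes are passed to the scaled-MFMA
// intrinsics below to describe each operand's element format.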
/// If there is a scaled MFMA instruction for the input element types `aType`
/// and `bType`, output type `destType`, problem size M, N, K, and number of
/// blocks B on the given `chipset`, return a tuple of the intrinsic's name
/// and the type codes for A and B.
static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
mfmaOpToScaledIntrinsic(Type aType, Type bType, Type destType, uint32_t m,
                        uint32_t n, uint32_t k, uint32_t b, Chipset chipset) {
  aType = getElementTypeOrSelf(aType);
  bType = getElementTypeOrSelf(bType);
  if (chipset != kGfx950)
    return std::nullopt;
  if (!isa<Float32Type>(destType))
    return std::nullopt;

  std::optional<uint32_t> aTypeCode = mfmaTypeSelectCode(aType);
  std::optional<uint32_t> bTypeCode = mfmaTypeSelectCode(bType);
  if (!aTypeCode || !bTypeCode)
    return std::nullopt;

  if (m == 32 && n == 32 && k == 64 && b == 1)
    return std::tuple{ROCDL::mfma_scale_f32_32x32x64_f8f6f4::getOperationName(),
                      *aTypeCode, *bTypeCode};
  if (m == 16 && n == 16 && k == 128 && b == 1)
    return std::tuple{
        ROCDL::mfma_scale_f32_16x16x128_f8f6f4::getOperationName(), *aTypeCode,
        *bTypeCode};

  return std::nullopt;
}

static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
mfmaOpToScaledIntrinsic(MFMAOp mfma, Chipset chipset) {
  return mfmaOpToScaledIntrinsic(
      mfma.getSourceA().getType(), mfma.getSourceB().getType(),
      mfma.getDestC().getType(), mfma.getM(), mfma.getN(), mfma.getK(),
      mfma.getBlocks(), chipset);
}

static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
mfmaOpToScaledIntrinsic(ScaledMFMAOp smfma, Chipset chipset) {
  return mfmaOpToScaledIntrinsic(smfma.getSourceA().getType(),
                                 smfma.getSourceB().getType(),
                                 smfma.getDestC().getType(), smfma.getM(),
                                 smfma.getN(), smfma.getK(), 1u, chipset);
}
/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
/// if one exists. This includes checking to ensure the intrinsic is supported
/// on the architecture you are compiling for.
static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
                                                  Chipset chipset) {
  auto sourceVectorType = dyn_cast<VectorType>(wmma.getSourceA().getType());
  auto sourceBVectorType = dyn_cast<VectorType>(wmma.getSourceB().getType());
  auto destVectorType = dyn_cast<VectorType>(wmma.getDestC().getType());
  auto elemSourceType = sourceVectorType.getElementType();
  auto elemBSourceType = sourceBVectorType.getElementType();
  auto elemDestType = destVectorType.getElementType();

  if (elemSourceType.isF16() && elemDestType.isF32())
    return ROCDL::wmma_f32_16x16x16_f16::getOperationName();
  if (elemSourceType.isBF16() && elemDestType.isF32())
    return ROCDL::wmma_f32_16x16x16_bf16::getOperationName();
  if (elemSourceType.isF16() && elemDestType.isF16())
    return ROCDL::wmma_f16_16x16x16_f16::getOperationName();
  if (elemSourceType.isBF16() && elemDestType.isBF16())
    return ROCDL::wmma_bf16_16x16x16_bf16::getOperationName();
  if (elemSourceType.isInteger(8) && elemDestType.isInteger(32))
    return ROCDL::wmma_i32_16x16x16_iu8::getOperationName();
  if (chipset.majorVersion == 11) {
    if (elemSourceType.isInteger(4) && elemDestType.isInteger(32))
      return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
  }
  if (chipset.majorVersion >= 12) {
    if (isa<Float8E4M3FNType>(elemSourceType) &&
        isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32())
      return ROCDL::wmma_f32_16x16x16_fp8_fp8::getOperationName();
    if (isa<Float8E4M3FNType>(elemSourceType) &&
        isa<Float8E5M2Type>(elemBSourceType) && elemDestType.isF32())
      return ROCDL::wmma_f32_16x16x16_fp8_bf8::getOperationName();
    if (isa<Float8E5M2Type>(elemSourceType) &&
        isa<Float8E5M2Type>(elemBSourceType) && elemDestType.isF32())
      return ROCDL::wmma_f32_16x16x16_bf8_bf8::getOperationName();
    if (isa<Float8E5M2Type>(elemSourceType) &&
        isa<Float8E4M3FNType>(elemBSourceType) && elemDestType.isF32())
      return ROCDL::wmma_f32_16x16x16_bf8_fp8::getOperationName();
    if (elemSourceType.isInteger(4) && elemDestType.isInteger(32)) {
      bool isWave64 = destVectorType.getNumElements() == 4;
      // Deduce the variant from the vector shapes: in wave64 mode, 8 i4
      // inputs select the 16x16x32 form; in wave32 mode, 8 inputs instead
      // correspond to the 16x16x16 form.
      bool has8Inputs = sourceVectorType.getNumElements() == 8;
      if ((isWave64 && has8Inputs) || (!isWave64 && !has8Inputs))
        return ROCDL::wmma_i32_16x16x32_iu4::getOperationName();
      return ROCDL::wmma_i32_16x16x16_iu4::getOperationName();
    }
  }
  return std::nullopt;
}
  LogicalResult
  matchAndRewrite(MFMAOp op, MFMAOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type outType = typeConverter->convertType(op.getDestD().getType());
    Type intrinsicOutType = outType;
    if (auto outVecType = dyn_cast<VectorType>(outType))
      if (outVecType.getElementType().isBF16())
        intrinsicOutType = outVecType.clone(rewriter.getI16Type());

    if (chipset.majorVersion != 9 || chipset < kGfx908)
      return op->emitOpError("MFMA only supported on gfx908+");
    uint32_t getBlgpField = static_cast<uint32_t>(op.getBlgp());
    if (op.getNegateA() || op.getNegateB() || op.getNegateC()) {
      if (chipset < kGfx942)
        return op.emitOpError("negation unsupported on older than gfx942");
      getBlgpField |=
          op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
    }
    std::optional<StringRef> maybeIntrinsic = mfmaOpToIntrinsic(op, chipset);
    std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
        maybeScaledIntrinsic = mfmaOpToScaledIntrinsic(op, chipset);
    if (!maybeIntrinsic.has_value() && !maybeScaledIntrinsic.has_value())
      return op.emitOpError("no intrinsic matching MFMA size on given chipset");

    bool isScaled =
        !maybeIntrinsic.has_value() && maybeScaledIntrinsic.has_value();
    if (isScaled &&
        (adaptor.getAbid() > 0 || getBlgpField > 0 || op.getCbsz() > 0)) {
      return op.emitOpError(
          "non-default abid, blgp, and cbsz aren't supported on MFMAs that can "
          "be scaled as those fields are used for type information");
    }

    StringRef intrinsicName =
        isScaled ? std::get<0>(*maybeScaledIntrinsic) : *maybeIntrinsic;
    OperationState loweredOp(loc, intrinsicName);
    loweredOp.addTypes(intrinsicOutType);
    loweredOp.addOperands(
        {convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
         convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
         adaptor.getDestC()});
    if (isScaled) {
      auto [_scaledName, aTypeCode, bTypeCode] = *maybeScaledIntrinsic;
      // ... (type-code and zero scale operands appended for the scaled form
      // elided) ...
    } else {
      loweredOp.addOperands({createI32Constant(rewriter, loc, op.getCbsz()),
                             createI32Constant(rewriter, loc, op.getAbid()),
                             createI32Constant(rewriter, loc, getBlgpField)});
    }
    Value lowered = rewriter.create(loweredOp)->getResult(0);
    if (outType != intrinsicOutType)
      lowered = rewriter.create<LLVM::BitcastOp>(loc, outType, lowered);
    rewriter.replaceOp(op, lowered);
    return success();
  }
  LogicalResult
  matchAndRewrite(ScaledMFMAOp op, ScaledMFMAOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type intrinsicOutType = typeConverter->convertType(op.getDestD().getType());

    if (chipset.majorVersion != 9 || chipset < kGfx908)
      return op->emitOpError("scaled MFMA only supported on gfx908+");
    std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
        maybeScaledIntrinsic = mfmaOpToScaledIntrinsic(op, chipset);
    if (!maybeScaledIntrinsic.has_value())
      return op.emitOpError(
          "no intrinsic matching scaled MFMA size on given chipset");

    auto [intrinsicName, aTypeCode, bTypeCode] = *maybeScaledIntrinsic;
    OperationState loweredOp(loc, intrinsicName);
    loweredOp.addTypes(intrinsicOutType);
    loweredOp.addOperands(
        {convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
         convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
         adaptor.getDestC()});
    loweredOp.addOperands(
        {createI32Constant(rewriter, loc, aTypeCode),
         createI32Constant(rewriter, loc, bTypeCode)
         /* ... (scale index and scale value operands elided) ... */});
    Value lowered = rewriter.create(loweredOp)->getResult(0);
    rewriter.replaceOp(op, lowered);
    return success();
  }
  LogicalResult
  matchAndRewrite(WMMAOp op, WMMAOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    auto outType =
        typeConverter->convertType<VectorType>(op.getDestD().getType());
    if (!outType)
      return rewriter.notifyMatchFailure(op, "type conversion failed");

    if (chipset.majorVersion != 11 && chipset.majorVersion != 12)
      return op->emitOpError("WMMA only supported on gfx11 and gfx12");

    // The WMMA operations represent vectors of bf16s as vectors of i16s, so
    // bitcast bfloats to i16 and bitcast the result back afterwards.
    VectorType rawOutType = outType;
    if (outType.getElementType().isBF16())
      rawOutType = outType.clone(rewriter.getI16Type());

    std::optional<StringRef> maybeIntrinsic = wmmaOpToIntrinsic(op, chipset);
    if (!maybeIntrinsic.has_value())
      return op.emitOpError("no intrinsic matching WMMA on the given chipset");

    if (chipset.majorVersion >= 12 && op.getSubwordOffset() != 0)
      return op.emitOpError("subwordOffset not supported on gfx12+");

    OperationState loweredOp(loc, *maybeIntrinsic);
    loweredOp.addTypes(rawOutType);

    SmallVector<Value, 4> operands;
    wmmaPushInputOperand(rewriter, loc, typeConverter, op.getUnsignedA(),
                         adaptor.getSourceA(), op.getSourceA(), operands);
    wmmaPushInputOperand(rewriter, loc, typeConverter, op.getUnsignedB(),
                         adaptor.getSourceB(), op.getSourceB(), operands);
    wmmaPushOutputOperand(rewriter, loc, typeConverter, adaptor.getDestC(),
                          op.getSubwordOffset(), op.getClamp(), operands);

    loweredOp.addOperands(operands);
    Operation *lowered = rewriter.create(loweredOp);

    Operation *maybeCastBack = lowered;
    if (rawOutType != outType)
      maybeCastBack =
          rewriter.create<LLVM::BitcastOp>(loc, outType, lowered->getResult(0));
    rewriter.replaceOp(op, maybeCastBack->getResults());
    return success();
  }
  LogicalResult
  matchAndRewrite(GatherToLDSOp op, GatherToLDSOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (chipset.majorVersion < 9 || chipset.majorVersion > 10)
      return op.emitOpError("pre-gfx9 and post-gfx10 not supported");

    Location loc = op.getLoc();

    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());

    // TODO: instead of only transferring one element per thread, we could
    // augment it to transfer multiple elements per thread by issuing
    // multiple `global_load_lds` instructions.
    Type transferType = op.getTransferType();
    size_t loadWidth = [&]() -> size_t {
      if (auto transferVectorType = dyn_cast<VectorType>(transferType)) {
        return transferVectorType.getNumElements() *
               (transferVectorType.getElementTypeBitWidth() / 8);
      }
      return transferType.getIntOrFloatBitWidth() / 8;
    }();

    // Currently only 1, 2, and 4 byte loads are supported.
    if (loadWidth != 1 && loadWidth != 2 && loadWidth != 4)
      return op.emitOpError("chipset unsupported element size");

    Value srcPtr =
        getStridedElementPtr(rewriter, loc, srcMemRefType, adaptor.getSrc(),
                             (adaptor.getSrcIndices()));
    Value dstPtr =
        getStridedElementPtr(rewriter, loc, dstMemRefType, adaptor.getDst(),
                             (adaptor.getDstIndices()));

    // ... (construction of the ROCDL load-to-LDS op elided) ...
    return success();
  }
struct ExtPackedFp8OpLowering final
    : public ConvertOpToLLVMPattern<ExtPackedFp8Op> {
  // ... (constructor capturing the target chipset elided) ...
  Chipset chipset;

  LogicalResult
  matchAndRewrite(ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

struct PackedTrunc2xFp8OpLowering final
    : public ConvertOpToLLVMPattern<PackedTrunc2xFp8Op> {
  // ... (constructor capturing the target chipset elided) ...
  Chipset chipset;

  LogicalResult
  matchAndRewrite(PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};

struct PackedStochRoundFp8OpLowering final
    : public ConvertOpToLLVMPattern<PackedStochRoundFp8Op> {
  // ... (constructor capturing the target chipset elided) ...
  Chipset chipset;

  LogicalResult
  matchAndRewrite(PackedStochRoundFp8Op op,
                  PackedStochRoundFp8OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};
LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
    ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  if (!(chipset == kGfx942 || hasOcpFp8(chipset)))
    return rewriter.notifyMatchFailure(
        loc, "Fp8 conversion instructions are not available on target "
             "architecture and their emulation is not implemented");
  Type v4i8 = getTypeConverter()->convertType(
      VectorType::get(4, rewriter.getI8Type()));
  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
  Type f32 = getTypeConverter()->convertType(op.getResult().getType());

  Value source = adaptor.getSource();
  auto sourceVecType = dyn_cast<VectorType>(op.getSource().getType());
  auto resultVecType = dyn_cast<VectorType>(op.getResult().getType());
  // Extend the source to a v4i8 so it can be bitcast to i32.
  if (!sourceVecType || sourceVecType.getNumElements() < 4) {
    Value longVec = rewriter.create<LLVM::UndefOp>(loc, v4i8);
    if (!sourceVecType) {
      longVec = rewriter.create<LLVM::InsertElementOp>(
          loc, longVec, source, createI32Constant(rewriter, loc, 0));
    } else {
      for (int32_t i = 0, e = sourceVecType.getNumElements(); i < e; ++i) {
        Value idx = createI32Constant(rewriter, loc, i);
        Value elem = rewriter.create<LLVM::ExtractElementOp>(loc, source, idx);
        longVec =
            rewriter.create<LLVM::InsertElementOp>(loc, longVec, elem, idx);
      }
    }
    source = longVec;
  }
  Value i32Source = rewriter.create<LLVM::BitcastOp>(loc, i32, source);
  if (resultVecType) {
    // ... (packed two-element extension path elided) ...
  }
  // ... (scalar extension path elided) ...
}
LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
    PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  if (!(chipset == kGfx942 || hasOcpFp8(chipset)))
    return rewriter.notifyMatchFailure(
        loc, "Fp8 conversion instructions are not available on target "
             "architecture and their emulation is not implemented");
  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());

  Type resultType = op.getResult().getType();
  Type resultElemType = getElementTypeOrSelf(resultType);

  Value sourceA = adaptor.getSourceA();
  Value sourceB = adaptor.getSourceB();
  if (!sourceB)
    sourceB = rewriter.create<LLVM::UndefOp>(loc, sourceA.getType());
  Value existing = adaptor.getExisting();
  if (existing)
    existing = rewriter.create<LLVM::BitcastOp>(loc, i32, existing);
  else
    existing = rewriter.create<LLVM::UndefOp>(loc, i32);

  Value result;
  if (typeIsExpectedBf8ForChipset(chipset, resultElemType))
    result = rewriter.create<ROCDL::CvtPkBf8F32Op>(loc, i32, sourceA, sourceB,
                                                   existing, op.getWordIndex());
  else if (typeIsExpectedFp8ForChipset(chipset, resultElemType))
    result = rewriter.create<ROCDL::CvtPkFp8F32Op>(loc, i32, sourceA, sourceB,
                                                   existing, op.getWordIndex());

  rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(
      op, getTypeConverter()->convertType(resultType), result);
  return success();
}
LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite(
    PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  if (!(chipset == kGfx942 || hasOcpFp8(chipset)))
    return rewriter.notifyMatchFailure(
        loc, "Fp8 conversion instructions are not available on target "
             "architecture and their emulation is not implemented");
  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());

  Type resultType = op.getResult().getType();
  Type resultElemType = getElementTypeOrSelf(resultType);

  Value source = adaptor.getSource();
  Value stoch = adaptor.getStochiasticParam();
  Value existing = adaptor.getExisting();
  if (existing)
    existing = rewriter.create<LLVM::BitcastOp>(loc, i32, existing);
  else
    existing = rewriter.create<LLVM::UndefOp>(loc, i32);

  Value result;
  if (typeIsExpectedBf8ForChipset(chipset, resultElemType))
    result = rewriter.create<ROCDL::CvtSrBf8F32Op>(loc, i32, source, stoch,
                                                   existing, op.getStoreIndex());
  else if (typeIsExpectedFp8ForChipset(chipset, resultElemType))
    result = rewriter.create<ROCDL::CvtSrFp8F32Op>(loc, i32, source, stoch,
                                                   existing, op.getStoreIndex());

  rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(
      op, getTypeConverter()->convertType(resultType), result);
  return success();
}
  LogicalResult
  matchAndRewrite(DPPOp DppOp, DPPOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Convert the source operand to the corresponding LLVM type.
    Location loc = DppOp.getLoc();
    Value src = adaptor.getSrc();
    Value old = adaptor.getOld();
    Type srcType = src.getType();
    Type oldType = old.getType();
    Type llvmType = nullptr;
    if (srcType.getIntOrFloatBitWidth() < 32) {
      llvmType = rewriter.getI32Type();
    } else if (isa<FloatType>(srcType)) {
      llvmType = (srcType.getIntOrFloatBitWidth() == 32)
                     ? rewriter.getF32Type()
                     : rewriter.getF64Type();
    } else if (isa<IntegerType>(srcType)) {
      llvmType = (srcType.getIntOrFloatBitWidth() == 32)
                     ? rewriter.getI32Type()
                     : rewriter.getI64Type();
    }
    auto llvmSrcIntType = typeConverter->convertType(
        rewriter.getIntegerType(srcType.getIntOrFloatBitWidth()));

    // If the source type is narrower than 32 bits, widen it to i32 via a
    // one-element vector insert plus bitcast.
    auto convertOperand = [&](Value operand, Type operandType) {
      if (operandType.getIntOrFloatBitWidth() <= 16) {
        if (llvm::isa<FloatType>(operandType)) {
          operand =
              rewriter.create<LLVM::BitcastOp>(loc, llvmSrcIntType, operand);
        }
        auto llvmVecType = typeConverter->convertType(mlir::VectorType::get(
            32 / operandType.getIntOrFloatBitWidth(), llvmSrcIntType));
        Value undefVec = rewriter.create<LLVM::UndefOp>(loc, llvmVecType);
        operand = rewriter.create<LLVM::InsertElementOp>(
            loc, undefVec, operand, createI32Constant(rewriter, loc, 0));
        operand = rewriter.create<LLVM::BitcastOp>(loc, llvmType, operand);
      }
      return operand;
    };

    src = convertOperand(src, srcType);
    old = convertOperand(old, oldType);

    // DPP control values, taken from llvm/lib/Target/AMDGPU/SIDefines.h.
    enum DppCtrl : unsigned {
      ROW_SHL0 = 0x100,
      ROW_SHR0 = 0x110,
      ROW_ROR0 = 0x120,
      WAVE_SHL1 = 0x130,
      WAVE_ROL1 = 0x134,
      WAVE_SHR1 = 0x138,
      WAVE_ROR1 = 0x13C,
      ROW_MIRROR = 0x140,
      ROW_HALF_MIRROR = 0x141,
      BCAST15 = 0x142,
      BCAST31 = 0x143,
    };

    auto kind = DppOp.getKind();
    auto permArgument = DppOp.getPermArgument();
    uint32_t DppCtrl = 0;

    switch (kind) {
    case DPPPerm::quad_perm:
      if (auto quadPermAttr = cast<ArrayAttr>(*permArgument)) {
        int32_t i = 0;
        for (auto elem : quadPermAttr.getAsRange<IntegerAttr>()) {
          uint32_t num = elem.getInt();
          DppCtrl |= num << (i * 2);
          i++;
        }
      }
      break;
    case DPPPerm::row_shl:
      if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
        DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHL0;
      }
      break;
    case DPPPerm::row_shr:
      if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
        DppCtrl = intAttr.getInt() + DppCtrl::ROW_SHR0;
      }
      break;
    case DPPPerm::row_ror:
      if (auto intAttr = cast<IntegerAttr>(*permArgument)) {
        DppCtrl = intAttr.getInt() + DppCtrl::ROW_ROR0;
      }
      break;
    case DPPPerm::wave_shl:
      DppCtrl = DppCtrl::WAVE_SHL1;
      break;
    case DPPPerm::wave_shr:
      DppCtrl = DppCtrl::WAVE_SHR1;
      break;
    case DPPPerm::wave_rol:
      DppCtrl = DppCtrl::WAVE_ROL1;
      break;
    case DPPPerm::wave_ror:
      DppCtrl = DppCtrl::WAVE_ROR1;
      break;
    case DPPPerm::row_mirror:
      DppCtrl = DppCtrl::ROW_MIRROR;
      break;
    case DPPPerm::row_half_mirror:
      DppCtrl = DppCtrl::ROW_HALF_MIRROR;
      break;
    case DPPPerm::row_bcast_15:
      DppCtrl = DppCtrl::BCAST15;
      break;
    case DPPPerm::row_bcast_31:
      DppCtrl = DppCtrl::BCAST31;
      break;
    }

    // Read the row_mask, bank_mask, and bound_ctrl attributes.
    auto rowMask = DppOp->getAttrOfType<IntegerAttr>("row_mask").getInt();
    auto bankMask = DppOp->getAttrOfType<IntegerAttr>("bank_mask").getInt();
    bool boundCtrl = DppOp->getAttrOfType<BoolAttr>("bound_ctrl").getValue();

    // Create a ROCDL DPP update op with the computed parameters.
    auto dppMovOp = rewriter.create<ROCDL::DPPUpdateOp>(
        loc, llvmType, old, src, DppCtrl, rowMask, bankMask, boundCtrl);

    Value result = dppMovOp.getRes();
    if (srcType.getIntOrFloatBitWidth() < 32) {
      result = rewriter.create<LLVM::TruncOp>(loc, llvmSrcIntType, result);
      if (!llvm::isa<IntegerType>(srcType)) {
        result = rewriter.create<LLVM::BitcastOp>(loc, srcType, result);
      }
    }

    rewriter.replaceOp(DppOp, result);
    return success();
  }
struct AMDGPUSwizzleBitModeLowering
    : public ConvertOpToLLVMPattern<SwizzleBitModeOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(SwizzleBitModeOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Type i32 = rewriter.getI32Type();
    Value src = adaptor.getSrc();
    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, src, i32);
    unsigned andMask = op.getAndMask();
    unsigned orMask = op.getOrMask();
    unsigned xorMask = op.getXorMask();

    // Bit 15 is 0 for the BitMode swizzle.
    // https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations/
    unsigned mask = andMask | (orMask << 5) | (xorMask << 10);
    Value maskValue = createI32Constant(rewriter, loc, mask);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res =
          rewriter.create<ROCDL::DsSwizzleOp>(loc, v.getType(), v, maskValue);
      swizzled.emplace_back(res);
    }

    Value result = LLVM::composeValue(rewriter, loc, swizzled, src.getType());
    rewriter.replaceOp(op, result);
    return success();
  }
};
struct ConvertAMDGPUToROCDLPass
    : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
  using Base::Base;

  void runOnOperation() override {
    MLIRContext *ctx = &getContext();
    FailureOr<Chipset> maybeChipset = Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    RewritePatternSet patterns(ctx);
    LLVMTypeConverter converter(ctx);
    populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset);
    LLVMConversionTarget target(getContext());
    target.addIllegalDialect<::mlir::amdgpu::AMDGPUDialect>();
    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
    target.addLegalDialect<::mlir::ROCDL::ROCDLDialect>();
    if (failed(applyPartialConversion(getOperation(), target,
                                      std::move(patterns))))
      signalPassFailure();
  }
};
} // namespace

void mlir::populateAMDGPUMemorySpaceAttributeConversions(
    TypeConverter &typeConverter) {
  typeConverter.addTypeAttributeConversion(
      [](BaseMemRefType type, amdgpu::AddressSpaceAttr as)
          -> TypeConverter::AttributeConversionResult {
        MLIRContext *ctx = as.getContext();
        Type i64 = IntegerType::get(ctx, 64);
        switch (as.getValue()) {
        case amdgpu::AddressSpace::FatRawBuffer:
          return IntegerAttr::get(i64, 7);
        case amdgpu::AddressSpace::BufferRsrc:
          return IntegerAttr::get(i64, 8);
        case amdgpu::AddressSpace::FatStructuredBuffer:
          return IntegerAttr::get(i64, 9);
        }
        return TypeConverter::AttributeConversionResult::abort();
      });
}

void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                                   RewritePatternSet &patterns,
                                                   Chipset chipset) {
  populateAMDGPUMemorySpaceAttributeConversions(converter);
  patterns
      .add<FatRawBufferCastLowering,
           RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
           RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
           RawBufferOpLowering<RawBufferAtomicFaddOp,
                               ROCDL::RawPtrBufferAtomicFaddOp>,
           RawBufferOpLowering<RawBufferAtomicFmaxOp,
                               ROCDL::RawPtrBufferAtomicFmaxOp>,
           RawBufferOpLowering<RawBufferAtomicSmaxOp,
                               ROCDL::RawPtrBufferAtomicSmaxOp>,
           RawBufferOpLowering<RawBufferAtomicUminOp,
                               ROCDL::RawPtrBufferAtomicUminOp>,
           RawBufferOpLowering<RawBufferAtomicCmpswapOp,
                               ROCDL::RawPtrBufferAtomicCmpSwap>,
           AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
           MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
           ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
           PackedStochRoundFp8OpLowering, GatherToLDSOpLowering>(converter,
                                                                 chipset);
  patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
}
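// A minimal sketch of driving this conversion manually (hypothetical driver,
// not part of this file); assumes a parsed module and a valid chipset name:
//
//   LogicalResult lowerAMDGPU(ModuleOp module) {
//     MLIRContext *ctx = module.getContext();
//     FailureOr<amdgpu::Chipset> chipset = amdgpu::Chipset::parse("gfx942");
//     if (failed(chipset))
//       return failure();
//     LLVMTypeConverter converter(ctx);
//     RewritePatternSet patterns(ctx);
//     populateAMDGPUToROCDLConversionPatterns(converter, patterns, *chipset);
//     ConversionTarget target(*ctx);
//     target.addIllegalDialect<amdgpu::AMDGPUDialect>();
//     target.addLegalDialect<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
//     return applyPartialConversion(module, target, std::move(patterns));
//   }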