@@ -2599,51 +2599,63 @@ def NVVM_CpAsyncBulkSharedCTAToSharedClusterOp :
2599
2599
}
2600
2600
2601
2601
def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
2602
- NVVM_Op<"cp.async.bulk.global.shared.cta"> {
2602
+ NVVM_Op<"cp.async.bulk.global.shared.cta", [AttrSizedOperandSegments] > {
2603
2603
let summary = "Async bulk copy from Shared CTA memory to Global memory";
2604
2604
let description = [{
2605
2605
Initiates an asynchronous copy operation from Shared CTA memory to
2606
- global memory.
2606
+ global memory. The 32-bit operand `size` specifies the amount of
2607
+ memory to be copied, in terms of number of bytes. `size` must be a
2608
+ multiple of 16. The `l2CacheHint` operand is optional, and it is used
2609
+ to specify cache eviction policy that may be used during the memory
2610
+ access. The `byteMask` operand is optional. The i-th bit in the 16-bit
2611
+ wide `byteMask` specifies whether the i-th byte of each 16-byte wide
2612
+ chunk of source data is copied to the destination. If the bit is set,
2613
+ the byte is copied.
2614
+
2615
+ Example:
2616
+ ```mlir
2617
+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size
2618
+ : !llvm.ptr<1>, !llvm.ptr<3>
2619
+
2620
+ // with l2_cache_hint
2621
+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch
2622
+ : !llvm.ptr<1>, !llvm.ptr<3>
2623
+
2624
+ // with byte_mask
2625
+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size byte_mask = %mask
2626
+ : !llvm.ptr<1>, !llvm.ptr<3>
2627
+
2628
+ // with both l2_cache_hint and byte_mask
2629
+ nvvm.cp.async.bulk.global.shared.cta %dst, %src, %size l2_cache_hint = %ch byte_mask = %mask
2630
+ : !llvm.ptr<1>, !llvm.ptr<3>
2631
+ ```
2607
2632
2608
- The `l2CacheHint` operand is optional, and it is used to specify cache
2609
- eviction policy that may be used during the memory access.
2610
-
2611
2633
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
2612
2634
}];
2613
2635
2614
2636
let arguments = (ins
2615
2637
LLVM_PointerGlobal:$dstMem,
2616
2638
LLVM_PointerShared:$srcMem,
2617
2639
I32:$size,
2618
- Optional<I64>:$l2CacheHint);
2640
+ Optional<I64>:$l2CacheHint,
2641
+ Optional<I16>:$byteMask);
2619
2642
2620
2643
let assemblyFormat = [{
2621
2644
$dstMem `,` $srcMem `,` $size
2622
2645
(`l2_cache_hint` `=` $l2CacheHint^ )?
2623
- attr-dict `:` type($dstMem) `,` type($srcMem)
2646
+ (`byte_mask` `=` $byteMask^ )?
2647
+ attr-dict `:` type($dstMem) `,` type($srcMem)
2624
2648
}];
2625
2649
2650
+ let extraClassDeclaration = [{
2651
+ static mlir::NVVM::IDArgPair
2652
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
2653
+ llvm::IRBuilderBase& builder);
2654
+ }];
2626
2655
string llvmBuilder = [{
2627
- // Arguments to the intrinsic:
2628
- // dst, src, size, cache_hint,
2629
- // Flag for cache_hint
2630
- //
2631
- llvm::SmallVector<llvm::Value *> translatedOperands;
2632
- translatedOperands.push_back($dstMem);
2633
- translatedOperands.push_back($srcMem);
2634
- translatedOperands.push_back($size);
2635
-
2636
- // Cachehint, if available
2637
- llvm::LLVMContext &ctx = moduleTranslation.getLLVMContext();
2638
- auto *i64Unused = llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0);
2639
- bool isCacheHint = op.getL2CacheHint() ? true : false;
2640
- translatedOperands.push_back(isCacheHint ? $l2CacheHint : i64Unused);
2641
-
2642
- // Flag argument for cachehint
2643
- translatedOperands.push_back(builder.getInt1(isCacheHint));
2644
-
2645
- createIntrinsicCall(builder,
2646
- llvm::Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global, translatedOperands);
2656
+ auto [id, args] = NVVM::CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
2657
+ *op, moduleTranslation, builder);
2658
+ createIntrinsicCall(builder, id, args);
2647
2659
}];
2648
2660
}
2649
2661
0 commit comments