20 typename ComputeTypeA,
21 typename ComputeTypeB,
23 typename AWmmaTileDesc,
24 typename BWmmaTileDesc,
25 index_t ABlockTransferSrcScalarPerVector,
26 index_t BBlockTransferSrcScalarPerVector,
35 bool TransposeC =
false>
43 typename ComputeTypeA,
44 typename ComputeTypeB,
46 typename AWmmaTileDesc,
47 typename BWmmaTileDesc,
48 index_t ABlockTransferSrcScalarPerVector,
49 index_t BBlockTransferSrcScalarPerVector,
68 ABlockTransferSrcScalarPerVector,
69 BBlockTransferSrcScalarPerVector,
87 ABlockTransferSrcScalarPerVector,
88 BBlockTransferSrcScalarPerVector,
107 ABlockTransferSrcScalarPerVector,
108 BBlockTransferSrcScalarPerVector,
134 GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
137 GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
142 using typename Base::Empty;
156 template <
bool HasMainLoop,
160 typename ABlockTransfer,
161 typename AGridBuffer,
162 typename ABlockBuffer,
163 typename ABlockTransferStep,
166 typename BBlockTransfer,
167 typename BGridBuffer,
168 typename BBlockBuffer,
169 typename BBlockTransferStep,
170 typename CThreadBuffer,
171 typename BScaleStruct>
172 __device__
void Run(
const AGridDesc& a_grid_desc,
173 const ABlockDesc& a_block_desc,
174 ABlockTransfer& a_blockwise_copy,
175 const AGridBuffer& a_grid_buf,
176 ABlockBuffer& a_block_buf,
177 const ABlockTransferStep& a_block_copy_step,
178 const BGridDesc& b_grid_desc,
179 const BBlockDesc& b_block_desc,
180 BBlockTransfer& b_blockwise_copy,
181 const BGridBuffer& b_grid_buf,
182 BBlockBuffer& b_block_buf,
183 const BBlockTransferStep& b_block_copy_step,
184 CThreadBuffer& c_thread_buf,
186 BScaleStruct& b_scale_struct,
188 index_t num_loop_per_scale)
const
196 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
197 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
199 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
200 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
202 b_scale_struct.template GlobalLoad<0>(num_loop_per_scale == 1);
205 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
206 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
209 c_thread_buf.Clear();
211 auto blockwise_gemm_func = [&]() {
222 if constexpr(m0 ==
I0)
245 b_scale_struct.b_scale_thread_bufs(
246 I0)[
Number<n0 * BScaleStruct::num_scale_k_block +
247 k0 / BScaleStruct::num_scale_krepeat>{}],
260 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
265 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
270 using wmma_input_type_a =
272 using wmma_input_type_b =
278 wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
279 b_thread_vec.template AsType<wmma_input_type_b>(),
287 if constexpr(HasMainLoop)
292 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
293 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
295 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
296 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
299 blockwise_gemm_func();
302 b_scale_struct.template GlobalLoad<0>((i + 2) % num_loop_per_scale == 0);
307 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
308 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
310 constexpr index_t num_ds_write_inst =
316 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
320 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
321 if constexpr(m0 ==
I0)
324 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
328 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
333 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
337 }
while(i < (num_loop - 1));
344 blockwise_gemm_func();
381 using Base::c_thread_desc_;
387 typename ComputeTypeA,
388 typename ComputeTypeB,
389 typename AccDataType,
390 typename AWmmaTileDesc,
391 typename BWmmaTileDesc,
392 index_t ABlockTransferSrcScalarPerVector,
393 index_t BBlockTransferSrcScalarPerVector,
412 ABlockTransferSrcScalarPerVector,
413 BBlockTransferSrcScalarPerVector,
431 ABlockTransferSrcScalarPerVector,
432 BBlockTransferSrcScalarPerVector,
451 ABlockTransferSrcScalarPerVector,
452 BBlockTransferSrcScalarPerVector,
476 GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
479 GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs;
484 using typename Base::Empty;
501 template <
bool HasMainLoop,
505 typename ABlockTransfer,
506 typename AGridBuffer,
507 typename ABlockBuffer,
508 typename ABlockTransferStep,
511 typename BBlockTransfer,
512 typename BGridBuffer,
513 typename BBlockBuffer,
514 typename BBlockTransferStep,
515 typename CThreadBuffer,
516 typename BScaleStruct>
517 __device__
void Run(
const AGridDesc& a_grid_desc,
518 const ABlockDesc& a_block_desc,
519 ABlockTransfer& a_blockwise_copy,
520 const AGridBuffer& a_grid_buf,
521 ABlockBuffer& a_block_buf,
522 const ABlockTransferStep& a_block_copy_step,
523 const BGridDesc& b_grid_desc,
524 const BBlockDesc& b_block_desc,
525 BBlockTransfer& b_blockwise_copy,
526 const BGridBuffer& b_grid_buf,
527 BBlockBuffer& b_block_buf,
528 const BBlockTransferStep& b_block_copy_step,
529 CThreadBuffer& c_thread_buf,
531 BScaleStruct& b_scale_struct,
533 index_t num_loop_per_scale)
const
541 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
542 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
544 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
545 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
547 b_scale_struct.template GlobalLoad<0>(num_loop_per_scale == 1);
550 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
551 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
554 c_thread_buf.Clear();
556 auto blockwise_gemm_func = [&]() {
602 b_scale_struct.b_scale_thread_bufs(
I0)[
Number<
603 n0 * BScaleStruct::num_scale_k_block +
604 (k0_offset + k0_inner) / BScaleStruct::num_scale_krepeat>{}],
612 __builtin_amdgcn_sched_barrier(0);
619 if constexpr(k0_offset != 0 ||
KRepeat == 1)
621 __builtin_amdgcn_s_barrier();
622 __builtin_amdgcn_sched_barrier(0);
631 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
641 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
651 using wmma_input_type_a =
653 using wmma_input_type_b =
665 if constexpr(k0_offset + k0_inner ==
KRepeat - 1 && m0 == MRepeat - 1 &&
668 __builtin_amdgcn_sched_barrier(0);
670 __builtin_amdgcn_sched_barrier(0);
672 wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
673 b_thread_vec.template AsType<wmma_input_type_b>(),
675 if constexpr(k0_inner == 0 && m0 == 0 && n0 == 0)
677 __builtin_amdgcn_sched_barrier(0);
678 __builtin_amdgcn_s_setprio(1);
679 __builtin_amdgcn_sched_barrier(0);
684 __builtin_amdgcn_sched_barrier(0);
685 __builtin_amdgcn_s_setprio(0);
686 __builtin_amdgcn_sched_barrier(0);
691 if constexpr(HasMainLoop)
696 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
697 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
699 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
700 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
703 blockwise_gemm_func();
705 b_scale_struct.template GlobalLoad<0>((i + 2) % num_loop_per_scale == 0);
710 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
711 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
714 }
while(i < (num_loop - 1));
721 blockwise_gemm_func();
735 Number<KPack / A_KRow * MRepeat>{},
749 Number<KPack / B_KRow * NRepeat>{},
778 using Base::c_thread_desc_;
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS
Definition ck.hpp:209
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Full
Definition blkgemmpipe_scheduler.hpp:49
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
@ Interwave
Definition blkgemmpipe_scheduler.hpp:27
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::HotLoopInstList ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma > HotLoopInstList
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:70
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::GetCThreadBuffer __host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:166
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_block_desc_k0_n0_n1_n2_k1 static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:337
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:37
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:57
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::CalculateCThreadOriginDataIndex static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >)
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:217
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::CalculateBThreadOriginDataIndex static __device__ auto CalculateBThreadOriginDataIndex()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:198
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockwiseGemmWmmaops_pipeline_base __host__ __device__ BlockwiseGemmWmmaops_pipeline_base(Tuple6 a_origin=CalculateAThreadOriginDataIndex(), Tuple6 b_origin=CalculateBThreadOriginDataIndex())
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:264
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_block_desc_k0_m0_m1_m2_k1 static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:336
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::wmma_gemm static constexpr auto wmma_gemm
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:63
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_KRow static constexpr index_t B_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:54
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:36
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:58
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:44
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WmmaK static constexpr auto WmmaK
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:68
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:66
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::CalculateAThreadOriginDataIndex static __device__ auto CalculateAThreadOriginDataIndex()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:180
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_KRow static constexpr index_t A_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:53
ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma >::A_LDS_Write_Inst_Num static constexpr index_t A_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_wmmaops.hpp:39
ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma >::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_wmmaops.hpp:36
ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma >::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_wmmaops.hpp:34
ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma >::B_LDS_Write_Inst_Num static constexpr index_t B_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_wmmaops.hpp:41
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:776
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:489
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_block_desc_k0_n0_n1_n2_k1 static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:337
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockHasHotloop static bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:493
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:57
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:726
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, BScaleStruct &b_scale_struct, index_t num_loop, index_t num_loop_per_scale) const
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:517
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_block_desc_k0_m0_m1_m2_k1 static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:336
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::wmma_gemm static constexpr auto wmma_gemm
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:63
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_KRow static constexpr index_t B_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:54
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:740
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockLoopTailNum static TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:495
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:36
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:58
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::NumKClusters static constexpr index_t NumKClusters
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:486
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:491
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), Sequence< KPack/A_K1/A_KRow, 1, 1, 1, 1, A_K1 >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:754
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::Base BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC > Base
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:443
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WmmaK static constexpr auto WmmaK
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:68
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:66
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BThreadCopy ThreadwiseTensorSliceTransfer_v4< BDataType, ComputeTypeB, decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), Sequence< KPack/B_K1/B_KRow, 1, 1, 1, 1, B_K1 >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, B_K1, B_K1 > BThreadCopy
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:765
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_KRow static constexpr index_t A_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:53
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:369
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::KRepeatPerCluster static constexpr index_t KRepeatPerCluster
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:487
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:777
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:490
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_block_desc_k0_n0_n1_n2_k1 static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:337
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:57
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockHasHotloop static bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:148
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, BScaleStruct &b_scale_struct, index_t num_loop, index_t num_loop_per_scale) const
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:172
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BThreadCopy ThreadwiseTensorSliceTransfer_v4< BDataType, ComputeTypeB, decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), Sequence< KPack/B_K1/B_KRow, 1, 1, 1, 1, B_K1 >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, B_K1, B_K1 > BThreadCopy
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:368
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_block_desc_k0_m0_m1_m2_k1 static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:336
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::wmma_gemm static constexpr auto wmma_gemm
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:63
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), Sequence< KPack/A_K1/A_KRow, 1, 1, 1, 1, A_K1 >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:357
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_KRow static constexpr index_t B_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:54
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:350
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:354
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:146
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:36
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:58
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:380
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:379
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WmmaK static constexpr auto WmmaK
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:68
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:145
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::Base BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC > Base
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:99
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::BlockLoopTailNum static TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:150
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_KRow static constexpr index_t A_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:53
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:369
ck::BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:144
Definition blockwise_gemm_pipeline_wmmaops_v1.hpp:37
Definition utility/sequence.hpp:43
Definition threadwise_tensor_slice_transfer.hpp:1260
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition functional2.hpp:33
Definition dtype_vector.hpp:10