DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference#
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer > Struct Template Reference
#include <device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp>
Inheritance diagram for ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >:
Classes | |
| struct | ComputeBasePtrOfStridedBatch |
| struct | RawArg |
| struct | SelfAttnArg |
| struct | CrossAttnArg |
| struct | Argument |
| struct | SelfAttnInvoker |
| struct | CrossAttnInvoker |
| struct | Invoker |
Public Types | |
| using | DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle |
| using | Transform |
| using | AGridDesc = decltype(MakeAGridDescriptor({}, {})) |
| using | B0GridDesc = decltype(MakeB0GridDescriptor({}, {})) |
| using | B1GridDesc = decltype(MakeB1GridDescriptor({}, {})) |
| using | CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})) |
| using | AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})) |
| using | B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})) |
| using | B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})) |
| using | CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})) |
| using | C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())> |
| using | GridwiseOp |
Public Member Functions | |
| bool | IsSupportedArgument (const BaseArgument *p_arg) override |
| std::unique_ptr< BaseArgument > | MakeArgumentPointer (const void *p_a, const void *p_b0, const void *p_b1, void *p_c, const std::array< void *, NumAcc0Bias > p_acc0_biases, const std::array< void *, NumAcc1Bias > p_acc1_biases, const std::vector< index_t > &a_gs_ms_ks_lengths, const std::vector< index_t > &a_gs_ms_ks_strides, const std::vector< index_t > &b0_gs_ls_ks_lengths, const std::vector< index_t > &b0_gs_ls_ks_strides, const std::vector< index_t > &b1_gs_ns_ls_lengths, const std::vector< index_t > &b1_gs_ns_ls_strides, const std::vector< index_t > &c_gs_ms_ns_lengths, const std::vector< index_t > &c_gs_ms_ns_strides, const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_lengths, const std::array< std::vector< ck::index_t >, NumAcc0Bias > acc0_biases_gs_ms_ls_strides, const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_lengths, const std::array< std::vector< ck::index_t >, NumAcc1Bias > acc1_biases_gs_ms_ns_strides, AElementwiseOperation a_element_op, B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, CElementwiseOperation c_element_op) override |
| std::unique_ptr< BaseInvoker > | MakeInvokerPointer () override |
| std::string | GetTypeString () const override |
| Public Member Functions inherited from ck::tensor_operation::device::BaseOperator | |
| BaseOperator ()=default | |
| BaseOperator (const BaseOperator &)=default | |
| BaseOperator & | operator= (const BaseOperator &)=default |
| virtual std::string | GetInstanceString () const |
| virtual std::string | GetTypeIdName () const |
| virtual std::optional< std::string > | GetObjectName () const |
| virtual std::optional< std::string > | GetTemplateInfo () const |
| virtual std::string | GetTypeIdHashCode () const |
| virtual size_t | GetWorkSpaceSize (const BaseArgument *) const |
| virtual void | SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const |
| virtual | ~BaseOperator () |
Static Public Member Functions | |
| __host__ static __device__ auto | MakeAGridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &a_gs_ms_ks_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &a_gs_ms_ks_strides_vec) |
| __host__ static __device__ auto | MakeB0GridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &b0_gs_ls_ks_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &b0_gs_ls_ks_strides_vec) |
| __host__ static __device__ auto | MakeB1GridDescriptor (const std::array< index_t, NumDimG+NumDimM+NumDimN > &b1_gs_ns_ls_lengths_vec, const std::array< index_t, NumDimG+NumDimM+NumDimN > &b1_gs_ns_ls_strides_vec) |
| __host__ __device__ static constexpr auto | make_MaskOutPredicate () |
| static auto | MakeArgument (const ADataType *p_a, const B0DataType *p_b0, const B1DataType *p_b1, CDataType *p_c, index_t M, index_t N, index_t K, index_t O, index_t G0, index_t G1, float alpha, bool input_permute, bool output_permute) |
| static bool | IsSupportedArgument (const RawArg &arg) |
| static auto | MakeSelfAttnArgument (const ADataType *p_qkv_grid, CDataType *p_out_grid, index_t batch_size, index_t sequence_length, index_t head_count, index_t head_size, float alpha) |
| static auto | MakeCrossAttnArgument (const ADataType *p_q_grid, const B0DataType *p_kv_grid, CDataType *p_out_grid, index_t batch_size, index_t q_sequence_length, index_t kv_sequence_length, index_t head_count, index_t head_size, float alpha) |
| static auto | MakeSelfAttnInvoker () |
| static auto | MakeCrossAttnInvoker () |
| static constexpr bool | IsValidCompilationParameter () |
| static auto | MakeInvoker () |
Static Public Attributes | |
| static constexpr index_t | NumAcc0Bias = Acc0BiasDataType::Size() |
| static constexpr index_t | NumAcc1Bias = Acc1BiasDataType::Size() |
| static constexpr index_t | NumDimGemm0M = NumDimM |
| static constexpr index_t | NumDimGemm0N = NumDimL |
| static constexpr index_t | NumDimGemm0K = NumDimK |
| static constexpr index_t | NumDimGemm1M = NumDimM |
| static constexpr index_t | NumDimGemm1N = NumDimN |
| static constexpr index_t | NumDimGemm1K = NumDimL |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | I5 = Number<5>{} |
| static constexpr auto | I6 = Number<6>{} |
| static constexpr auto | WmmaK = 16 |
| static constexpr auto | MWaves = MPerBlock / (MRepeat * MPerWmma) |
| static constexpr auto | LWaves = LPerBlock / (LRepeat * LPerWmma) |
| static constexpr auto | NWaves = NPerBlock / (NRepeat * NPerWmma) |
| static constexpr auto | AEnableLds_auto = LWaves == 1 ? false : true |
| static constexpr auto | B0EnableLds_auto = MWaves == 1 ? false : true |
| static constexpr auto | B1EnableLds_auto = MWaves == 1 ? false : true |
| static constexpr auto | AEnableLds_manu = false |
| static constexpr auto | B0EnableLds_manu = true |
| static constexpr auto | B1EnableLds_manu = true |
| static constexpr auto | AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1) |
| static constexpr auto | B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1) |
| static constexpr auto | B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1) |
| Static Public Attributes inherited from ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc1BiasDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, MaskingSpec > | |
| static constexpr index_t | NumAcc0Bias |
| static constexpr index_t | NumAcc1Bias |
Member Typedef Documentation
◆ AGridDesc
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AGridDesc = decltype(MakeAGridDescriptor({}, {})) |
◆ AGridDesc_G_M_K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})) |
◆ B0GridDesc
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0GridDesc = decltype(MakeB0GridDescriptor({}, {})) |
◆ B0GridDesc_G_L_K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})) |
◆ B1GridDesc
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1GridDesc = decltype(MakeB1GridDescriptor({}, {})) |
◆ B1GridDesc_G_N_L
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})) |
◆ C0MatrixMask
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())> |
◆ CGridDesc_G_M_N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})) |
◆ CGridDesc_M_N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})) |
◆ DeviceOp
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle |
◆ GridwiseOp
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::GridwiseOp |
◆ Transform
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
| using ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimL, NumDimK, NumDimN, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, GemmSpec, ASpec, B0Spec, B1Spec, CSpec, NumPrefetch, BlockSize, MPerBlock, LPerBlock, KPerBlock, AK1, BK1, NPerBlock, LTilePerBlock, L1, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, LoopSched, PipelineVer >::Transform |
Initial value:
GemmSpec,
ASpec,
B0Spec,
B1Spec,
CSpec>
Definition utility/sequence.hpp:43
Definition transform_contraction_to_gemm_arraybase.hpp:122
Member Function Documentation
◆ GetTypeString()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [1/2]
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [2/2]
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ IsValidCompilationParameter()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestaticconstexpr |
◆ make_MaskOutPredicate()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestaticconstexpr |
◆ MakeAGridDescriptor()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeArgument()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeArgumentPointer()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
◆ MakeB0GridDescriptor()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeB1GridDescriptor()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeCrossAttnArgument()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeCrossAttnInvoker()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeInvoker()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeInvokerPointer()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlineoverridevirtual |
◆ MakeSelfAttnArgument()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
◆ MakeSelfAttnInvoker()
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
inlinestatic |
Member Data Documentation
◆ AEnableLds
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ AEnableLds_auto
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ AEnableLds_manu
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B0EnableLds
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B0EnableLds_auto
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B0EnableLds_manu
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B1EnableLds
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B1EnableLds_auto
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ B1EnableLds_manu
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I0
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I1
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I2
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I3
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I4
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I5
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ I6
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ LWaves
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ MWaves
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumAcc0Bias
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumAcc1Bias
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm0K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm0M
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm0N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm1K
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm1M
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NumDimGemm1N
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ NWaves
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
◆ WmmaK
template<index_t NumDimG, index_t NumDimM, index_t NumDimL, index_t NumDimK, index_t NumDimN, typename ADataType, typename B0DataType, typename B1DataType, typename CDataType, typename Acc0BiasDataType, typename Acc0DataType, typename Acc1BiasDataType, typename Acc1DataType, typename CShuffleDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization B0Spec, TensorSpecialization B1Spec, TensorSpecialization CSpec, ck::index_t NumPrefetch, ck::index_t BlockSize, ck::index_t MPerBlock, ck::index_t LPerBlock, ck::index_t KPerBlock, ck::index_t AK1, ck::index_t BK1, ck::index_t NPerBlock, ck::index_t LTilePerBlock, ck::index_t L1, ck::index_t MPerWmma, ck::index_t LPerWmma, ck::index_t NPerWmma, ck::index_t MRepeat, ck::index_t LRepeat, ck::index_t NRepeat, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, ck::index_t ABlockTransferSrcVectorDim, ck::index_t ABlockTransferSrcScalarPerVector, ck::index_t ABlockTransferDstScalarPerVector_K1, bool ABlockLdsAddExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, ck::index_t B0BlockTransferSrcVectorDim, ck::index_t B0BlockTransferSrcScalarPerVector, ck::index_t B0BlockTransferDstScalarPerVector_K1, bool B0BlockLdsAddExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, ck::index_t B1BlockTransferSrcVectorDim, ck::index_t B1BlockTransferSrcScalarPerVector, ck::index_t B1BlockTransferDstScalarPerVector_L1, bool B1BlockLdsAddExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization MaskingSpec, ck::LoopScheduler LoopSched = make_default_loop_scheduler(), ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1>
|
staticconstexpr |
The documentation for this struct was generated from the following file: