提交 c7a99098 编写于 作者: M Megvii Engine Team

feat(cuda): add int4 ptx 256x64 mma kernel

GitOrigin-RevId: 8f7475b0f6f60c13f7cc8e571c81822d72b6f689
上级 cf3ca1e9
#pragma once
#include "./base.cuh"
// ---------------------------------------------------------------------------
// Compile-time configuration macros for this kernel variant.
// Primary constants come first; every derived macro below is computed only
// from them, so changing a primary value propagates automatically.
// NOTE(review): the roles described here are inferred from the macro names
// and the commit title ("int4 ptx 256x64 mma kernel") -- confirm against the
// kernel body that consumes them.
// ---------------------------------------------------------------------------

// Launch shape -- presumably threads per block along x / y (TODO confirm).
#define TX 128
#define TY 1

// Block-level tile extents; BN x BM matches the "256x64" in the commit title.
#define BN 256
#define BM 64
#define BK 128

// Shape of one MMA instruction -- looks like the m16n8k64 form; verify
// against the PTX emitted by the kernel.
#define mma_m 16
#define mma_n 8
#define mma_k 64

// Per-thread register tiling factors (assumed; units are not visible here).
#define reg_m 8
#define reg_n 8

// Channel packing factor and per-warp cache size (units not visible here).
#define packed_channel 64
#define cache_per_warp 128

// Width of a single global load -- presumably in bytes (TODO confirm).
#define ldg_width 16

// ---- Derived values (all fully parenthesized for safe expansion) ----------

// TX / 32: evaluates to 4 with the values above.
#define WARPS (TX / 32)

// BK / 32: evaluates to 4 with the values above.
#define BKd32 (BK / 32)

// Register tile factors divided by four (shift by 2): both evaluate to 2.
#define reg_md4 (reg_m >> 2)
#define reg_nd4 (reg_n >> 2)

// Tile element count divided by (16 * TX): 16 for the BN x BK tile and 4 for
// the BM x BK tile. Presumably the number of global loads each thread issues
// for the source / filter tile -- confirm against the load loops.
#define ldg_src (BN * BK / (16 * TX))
#define ldg_filter (BM * BK / (16 * TX))
// vim: syntax=cpp.doxygen
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册