Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
e661ae90
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
e661ae90
编写于
5月 21, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn/cuda): add base class for cutlass uint4 and int4 algos
GitOrigin-RevId: a4d42f032c7e53f2966016092ba52c091575be77
上级
319436dd
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
366 addition
and
364 deletion
+366
-364
dnn/src/cuda/conv_bias/algo.h
dnn/src/cuda/conv_bias/algo.h
+81
-49
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
...rc/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
+54
-153
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
...rc/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
+149
-0
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
...c/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
+81
-162
dnn/src/cuda/conv_bias/opr_impl.h
dnn/src/cuda/conv_bias/opr_impl.h
+1
-0
未找到文件。
dnn/src/cuda/conv_bias/algo.h
浏览文件 @
e661ae90
...
...
@@ -765,7 +765,7 @@ private:
std
::
string
m_name
;
};
class
ConvBiasForwardImpl
::
AlgoInt4
Int4NCHW64IMMAImplicitGemm
final
class
ConvBiasForwardImpl
::
AlgoInt4
NCHW64IMMAImplicitGemmBase
:
public
AlgoBase
{
public:
struct
AlgoParam
{
...
...
@@ -776,89 +776,121 @@ public:
int
warp_n
;
int
warp_k
;
};
AlgoInt4NCHW64IMMAImplicitGemmBase
(
AlgoParam
algo_param
)
:
m_algo_param
(
algo_param
)
{}
AlgoAttribute
attribute
()
const
override
{
return
AlgoAttribute
::
REPRODUCIBLE
;
}
const
char
*
name
()
const
override
{
return
m_name
.
c_str
();
}
std
::
string
param
()
const
override
;
bool
is_available
(
const
SizeArgs
&
args
)
const
override
;
void
exec
(
const
ExecArgs
&
args
)
const
override
;
std
::
string
to_string
(
AlgoParam
algo_param
);
protected:
virtual
DTypeEnum
src_dtype
()
const
=
0
;
// return filter_ptr, bias_ptr
virtual
std
::
tuple
<
void
*
,
void
*>
prepare_filter_bias
(
const
ExecArgs
&
args
)
const
=
0
;
// return alpha, beta, gamma, delta, theta
virtual
std
::
tuple
<
float
,
float
,
float
,
float
,
float
>
get_constants
(
const
ExecArgs
&
args
)
const
=
0
;
virtual
void
do_exec
(
const
ExecArgs
&
args
,
void
*
filter_ptr
,
void
*
bias_ptr
,
void
*
z_ptr
,
convolution
::
ConvParam
kern_param
,
uint32_t
nonlinear_mode
,
float
alpha
,
float
beta
,
float
gamma
,
float
delta
,
float
theta
,
cudaStream_t
stream
)
const
=
0
;
void
reorder_filter
(
const
ExecArgs
&
args
,
void
*
reordered_filter
)
const
;
std
::
string
m_name
;
AlgoParam
m_algo_param
;
};
class
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
final
:
public
AlgoInt4NCHW64IMMAImplicitGemmBase
{
public:
using
Base
=
AlgoInt4NCHW64IMMAImplicitGemmBase
;
using
AlgoParam
=
Base
::
AlgoParam
;
AlgoInt4Int4NCHW64IMMAImplicitGemm
(
AlgoParam
algo_param
)
:
m_algo_param
{
algo_param
}
{
:
Base
{
algo_param
}
{
m_name
=
ConvBias
::
algo_name
<
ConvBias
::
DirectParam
>
(
ssprintf
(
"INT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s"
,
to_string
(
m_algo_param
).
c_str
()),
ConvBias
::
DirectParam
{});
}
bool
is_available
(
const
SizeArgs
&
args
)
const
override
;
size_t
get_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
override
;
void
exec
(
const
ExecArgs
&
args
)
const
override
;
const
char
*
name
()
const
override
{
return
m_name
.
c_str
();
}
AlgoAttribute
attribute
()
const
override
{
return
AlgoAttribute
::
REPRODUCIBLE
;
}
static
std
::
string
to_string
(
AlgoParam
algo_param
);
size_t
get_preprocess_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
override
;
SmallVector
<
TensorLayout
>
deduce_preprocessed_filter_layout
(
const
SizeArgs
&
args
)
const
override
;
void
exec_preprocess
(
const
ExecArgs
&
args
)
const
override
;
MEGDNN_DECL_ALGO_TYPE
(
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4
)
std
::
string
param
()
const
override
{
std
::
string
ret
;
serialize_write_pod
(
m_algo_param
,
ret
);
return
ret
;
}
MEGDNN_DECL_ALGO_TYPE
(
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4
)
private:
WorkspaceBundle
get_workspace_bundle
(
dt_byte
*
raw_ptr
,
const
SizeArgs
&
args
)
const
;
DTypeEnum
src_dtype
()
const
override
{
return
DTypeEnum
::
QuantizedS4
;
}
AlgoParam
m_algo_param
;
std
::
string
m_name
;
std
::
tuple
<
void
*
,
void
*>
prepare_filter_bias
(
const
ExecArgs
&
args
)
const
override
;
std
::
tuple
<
float
,
float
,
float
,
float
,
float
>
get_constants
(
const
ExecArgs
&
args
)
const
override
;
void
do_exec
(
const
ExecArgs
&
args
,
void
*
filter_ptr
,
void
*
bias_ptr
,
void
*
z_ptr
,
convolution
::
ConvParam
kern_param
,
uint32_t
nonlinear_mode
,
float
alpha
,
float
beta
,
float
gamma
,
float
delta
,
float
theta
,
cudaStream_t
stream
)
const
override
;
};
class
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
final
:
public
AlgoBase
{
:
public
Algo
Int4NCHW64IMMAImplicitGemm
Base
{
public:
struct
AlgoParam
{
int
threadblock_m
;
int
threadblock_n
;
int
threadblock_k
;
int
warp_m
;
int
warp_n
;
int
warp_k
;
};
using
Base
=
AlgoInt4NCHW64IMMAImplicitGemmBase
;
using
AlgoParam
=
Base
::
AlgoParam
;
AlgoUInt4Int4NCHW64IMMAImplicitGemm
(
AlgoParam
algo_param
)
:
m_algo_param
{
algo_param
}
{
:
Base
{
algo_param
}
{
m_name
=
ConvBias
::
algo_name
<
ConvBias
::
DirectParam
>
(
ssprintf
(
"UINT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s"
,
to_string
(
m_algo_param
).
c_str
()),
ConvBias
::
DirectParam
{});
}
bool
is_available
(
const
SizeArgs
&
args
)
const
override
;
size_t
get_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
override
;
void
exec
(
const
ExecArgs
&
args
)
const
override
;
const
char
*
name
()
const
override
{
return
m_name
.
c_str
();
}
AlgoAttribute
attribute
()
const
override
{
return
AlgoAttribute
::
REPRODUCIBLE
;
}
static
std
::
string
to_string
(
AlgoParam
algo_param
);
size_t
get_preprocess_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
override
;
SmallVector
<
TensorLayout
>
deduce_preprocessed_filter_layout
(
const
SizeArgs
&
args
)
const
override
;
void
exec_preprocess
(
const
ExecArgs
&
args
)
const
override
;
MEGDNN_DECL_ALGO_TYPE
(
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4
)
std
::
string
param
()
const
override
{
std
::
string
ret
;
serialize_write_pod
(
m_algo_param
,
ret
);
return
ret
;
}
MEGDNN_DECL_ALGO_TYPE
(
CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4
)
private:
WorkspaceBundle
get_workspace_bundle
(
dt_byte
*
raw_ptr
,
const
SizeArgs
&
args
)
const
;
void
reorder_filter_bias
(
const
ExecArgs
&
args
,
void
*
reduce_filter
,
void
*
reordered_filter
,
void
*
reordered_bias
)
const
;
AlgoParam
m_algo_param
;
std
::
string
m_name
;
DTypeEnum
src_dtype
()
const
override
{
return
DTypeEnum
::
Quantized4Asymm
;
}
std
::
tuple
<
void
*
,
void
*>
prepare_filter_bias
(
const
ExecArgs
&
args
)
const
override
;
std
::
tuple
<
float
,
float
,
float
,
float
,
float
>
get_constants
(
const
ExecArgs
&
args
)
const
override
;
void
do_exec
(
const
ExecArgs
&
args
,
void
*
filter_ptr
,
void
*
bias_ptr
,
void
*
z_ptr
,
convolution
::
ConvParam
kern_param
,
uint32_t
nonlinear_mode
,
float
alpha
,
float
beta
,
float
gamma
,
float
delta
,
float
theta
,
cudaStream_t
stream
)
const
override
;
void
update_bias
(
const
ExecArgs
&
args
,
void
*
updated_bias
,
void
*
reduce_filter_ptr
,
void
*
reduce_workspace
)
const
;
};
#endif
...
...
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nchw64_imma.cpp
浏览文件 @
e661ae90
...
...
@@ -11,117 +11,59 @@
*/
#include "./algo.h"
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"
using
namespace
megdnn
;
using
namespace
cuda
;
using
namespace
convolution
;
#if CUDA_VERSION >= 10020
bool
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
is_available
(
size_t
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
get_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
if
(
args
.
bias_layout
->
ndim
<=
0
)
return
false
;
using
Param
=
param
::
ConvBias
;
using
Format
=
Param
::
Format
;
using
Sparse
=
Param
::
Sparse
;
using
Mode
=
Param
::
Mode
;
using
NonlineMode
=
megdnn
::
param
::
ConvBias
::
NonlineMode
;
auto
&&
param
=
args
.
opr
->
param
();
if
(
!
check_bias_share_in_channel
(
*
(
args
.
bias_layout
),
param
.
format
))
return
false
;
if
(
param
.
format
!=
Format
::
NCHW64
||
param
.
sparse
!=
Sparse
::
DENSE
||
param
.
mode
!=
Mode
::
CROSS_CORRELATION
)
return
false
;
if
(
param
.
nonlineMode
!=
NonlineMode
::
IDENTITY
&&
param
.
nonlineMode
!=
NonlineMode
::
RELU
&&
param
.
nonlineMode
!=
NonlineMode
::
H_SWISH
)
return
false
;
if
(
args
.
src_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
QuantizedS4
||
args
.
filter_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
QuantizedS4
||
args
.
bias_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
QuantizedS32
||
args
.
dst_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
QuantizedS4
)
return
false
;
if
(
!
is_compute_capability_required
(
7
,
5
))
return
false
;
return
true
;
}
WorkspaceBundle
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
get_workspace_bundle
(
dt_byte
*
raw_ptr
,
const
SizeArgs
&
args
)
const
{
if
(
args
.
preprocessed_filter
)
{
return
WorkspaceBundle
{
raw_ptr
,
{}}
;
return
0
;
}
else
{
size_t
ws_filter
=
args
.
filter_layout
->
span
().
dist_byte
();
return
WorkspaceBundle
{
raw_ptr
,
{
ws_filter
}};
return
args
.
filter_layout
->
span
().
dist_byte
();
}
}
size_t
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
get_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
return
get_workspace_bundle
(
nullptr
,
args
).
total_size_in_bytes
();
size_t
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
get_preprocess_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
return
0
;
}
SmallVector
<
TensorLayout
>
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
deduce_preprocessed_filter_layout
(
const
SizeArgs
&
args
)
const
{
return
{
args
.
filter_layout
->
collapse_contiguous
()};
}
void
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
exec
(
void
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
exec_preprocess
(
const
ExecArgs
&
args
)
const
{
megdnn_assert
(
args
.
preprocessed_filter
->
tensors
.
size
()
==
1
);
void
*
filter_ptr
=
args
.
preprocessed_filter
->
tensors
[
0
].
raw_ptr
;
reorder_filter
(
args
,
filter_ptr
);
}
std
::
tuple
<
void
*
,
void
*>
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
prepare_filter_bias
(
const
ExecArgs
&
args
)
const
{
auto
&&
param
=
args
.
opr
->
param
();
auto
&&
fm
=
args
.
filter_meta
;
size_t
n
=
args
.
src_layout
->
operator
[](
0
),
ci
=
args
.
src_layout
->
operator
[](
1
)
*
64
,
hi
=
args
.
src_layout
->
operator
[](
2
),
wi
=
args
.
src_layout
->
operator
[](
3
);
size_t
co
=
args
.
dst_layout
->
operator
[](
1
)
*
64
,
ho
=
args
.
dst_layout
->
operator
[](
2
),
wo
=
args
.
dst_layout
->
operator
[](
3
);
UNPACK_CONV_PARAMETER
(
fm
,
param
);
MARK_USED_VAR
auto
&&
stream
=
cuda_stream
(
args
.
opr
->
handle
());
int8_t
*
filter_ptr
=
nullptr
;
if
(
args
.
preprocessed_filter
==
nullptr
)
{
filter_ptr
=
reinterpret_cast
<
int8_t
*>
(
args
.
workspace
.
raw_ptr
);
// reformat filter from nchw64 to chwn64
TensorLayout
src
{{
co
,
ci
/
64
,
fh
,
fw
,
64
},
dtype
::
QuantizedS4
()};
src
.
init_contiguous_stride
();
TensorLayout
dst
=
src
;
dst
.
stride
[
0
]
=
64
;
dst
.
stride
[
1
]
=
co
*
fh
*
fw
*
64
;
dst
.
stride
[
2
]
=
co
*
fw
*
64
;
dst
.
stride
[
3
]
=
co
*
64
;
dst
.
stride
[
4
]
=
1
;
TensorND
ts_src
,
ts_dst
;
ts_src
.
raw_ptr
=
args
.
filter_tensor
->
raw_ptr
;
ts_src
.
layout
=
src
;
ts_dst
.
raw_ptr
=
args
.
workspace
.
raw_ptr
;
ts_dst
.
layout
=
dst
;
auto
&&
transpose
=
args
.
opr
->
handle
()
->
create_operator
<
RelayoutForward
>
();
transpose
->
exec
(
ts_src
,
ts_dst
);
void
*
filter_ptr
=
nullptr
;
if
(
args
.
preprocessed_filter
)
{
megdnn_assert
(
args
.
preprocessed_filter
->
tensors
.
size
()
==
1
);
filter_ptr
=
args
.
preprocessed_filter
->
tensors
[
0
].
raw_ptr
;
}
else
{
filter_ptr
=
reinterpret_cast
<
int8_t
*>
(
args
.
preprocessed_filter
->
tensors
[
0
].
raw
_ptr
);
filter_ptr
=
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
);
reorder_filter
(
args
,
filter
_ptr
);
}
void
*
bias_ptr
=
args
.
bias_tensor
->
raw_ptr
;
return
{
filter_ptr
,
bias_ptr
};
}
ConvParam
kern_param
;
kern_param
.
n
=
n
,
kern_param
.
co
=
co
,
kern_param
.
ci
=
ci
,
kern_param
.
hi
=
hi
,
kern_param
.
wi
=
wi
,
kern_param
.
ho
=
ho
,
kern_param
.
wo
=
wo
,
kern_param
.
ph
=
ph
,
kern_param
.
pw
=
pw
,
kern_param
.
sh
=
sh
,
kern_param
.
sw
=
sw
,
kern_param
.
fh
=
fh
,
kern_param
.
fw
=
fw
;
std
::
tuple
<
float
,
float
,
float
,
float
,
float
>
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
get_constants
(
const
ExecArgs
&
args
)
const
{
float
src_scale
=
args
.
src_layout
->
dtype
.
param
<
dtype
::
QuantizedS4
>
().
scale
,
filter_scale
=
args
.
filter_layout
->
dtype
.
param
<
dtype
::
QuantizedS4
>
().
scale
,
...
...
@@ -130,78 +72,37 @@ void ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm::exec(
dst_scale
=
args
.
dst_layout
->
dtype
.
param
<
dtype
::
QuantizedS4
>
().
scale
;
float
alpha
=
src_scale
*
filter_scale
/
dst_scale
,
beta
=
bias_scale
/
dst_scale
;
beta
=
bias_scale
/
dst_scale
,
gamma
=
0.
f
,
delta
=
0.
f
,
theta
=
0.
f
;
int8_t
*
z_dev_ptr
=
nullptr
;
float
gamma
=
0.
f
;
if
(
args
.
z_layout
->
ndim
>
0
)
{
z_dev_ptr
=
reinterpret_cast
<
int8_t
*>
(
args
.
z_tensor
->
raw_ptr
);
float
z_scale
=
args
.
z_layout
->
dtype
.
param
<
dtype
::
QuantizedS4
>
().
scale
;
gamma
=
z_scale
/
dst_scale
;
}
uint32_t
nonlinear_mode
=
static_cast
<
uint32_t
>
(
param
.
nonlineMode
);
cutlass_wrapper
::
do_conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64
<
true
>
(
reinterpret_cast
<
int8_t
*>
(
args
.
src_tensor
->
raw_ptr
),
filter_ptr
,
args
.
bias_tensor
->
compatible_ptr
<
int32_t
>
(),
z_dev_ptr
,
reinterpret_cast
<
int8_t
*>
(
args
.
dst_tensor
->
raw_ptr
),
nullptr
,
kern_param
,
nonlinear_mode
,
alpha
,
beta
,
gamma
,
dst_scale
,
cutlass_wrapper
::
GemmCoord
{
m_algo_param
.
threadblock_m
,
m_algo_param
.
threadblock_n
,
m_algo_param
.
threadblock_k
},
cutlass_wrapper
::
GemmCoord
{
m_algo_param
.
warp_m
,
m_algo_param
.
warp_n
,
m_algo_param
.
warp_k
},
stream
);
return
{
alpha
,
beta
,
gamma
,
delta
,
theta
};
}
std
::
string
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
to_string
(
AlgoParam
algo_param
)
{
return
ssprintf
(
"%uX%uX%u_%uX%uX%u"
,
algo_param
.
threadblock_m
,
algo_param
.
threadblock_n
,
algo_param
.
threadblock_k
,
algo_param
.
warp_m
,
algo_param
.
warp_n
,
algo_param
.
warp_k
);
}
void
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
do_exec
(
const
ExecArgs
&
args
,
void
*
filter_ptr
,
void
*
bias_ptr
,
void
*
z_ptr
,
ConvParam
kern_param
,
uint32_t
nonlinear_mode
,
float
alpha
,
float
beta
,
float
gamma
,
float
delta
,
float
theta
,
cudaStream_t
stream
)
const
{
float
dst_scale
=
args
.
dst_layout
->
dtype
.
param
<
dtype
::
QuantizedS4
>
().
scale
;
size_t
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
get_preprocess_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
return
0
_z
;
}
cutlass_wrapper
::
GemmCoord
threadblock_shape
{
m_algo_param
.
threadblock_m
,
m_algo_param
.
threadblock_n
,
m_algo_param
.
threadblock_k
};
SmallVector
<
TensorLayout
>
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
deduce_preprocessed_filter_layout
(
const
SizeArgs
&
args
)
const
{
return
{
args
.
filter_layout
->
collapse_contiguous
()};
}
cutlass_wrapper
::
GemmCoord
warp_shape
{
m_algo_param
.
warp_m
,
m_algo_param
.
warp_n
,
m_algo_param
.
warp_k
};
void
ConvBiasForwardImpl
::
AlgoInt4Int4NCHW64IMMAImplicitGemm
::
exec_preprocess
(
const
ExecArgs
&
args
)
const
{
auto
&&
param
=
args
.
opr
->
param
();
auto
&&
fm
=
args
.
filter_meta
;
size_t
n
=
args
.
src_layout
->
operator
[](
0
),
ci
=
args
.
src_layout
->
operator
[](
1
)
*
64
,
hi
=
args
.
src_layout
->
operator
[](
2
),
wi
=
args
.
src_layout
->
operator
[](
3
);
size_t
co
=
args
.
dst_layout
->
operator
[](
1
)
*
64
,
ho
=
args
.
dst_layout
->
operator
[](
2
),
wo
=
args
.
dst_layout
->
operator
[](
3
);
UNPACK_CONV_PARAMETER
(
fm
,
param
);
MARK_USED_VAR
TensorLayout
src
{{
co
,
ci
/
64
,
fh
,
fw
,
64
},
dtype
::
QuantizedS4
()};
src
.
init_contiguous_stride
();
TensorLayout
dst
=
src
;
dst
.
stride
[
0
]
=
64
;
dst
.
stride
[
1
]
=
co
*
fh
*
fw
*
64
;
dst
.
stride
[
2
]
=
co
*
fw
*
64
;
dst
.
stride
[
3
]
=
co
*
64
;
dst
.
stride
[
4
]
=
1
;
TensorND
ts_src
,
ts_dst
;
ts_src
.
raw_ptr
=
args
.
filter_tensor
->
raw_ptr
;
ts_src
.
layout
=
src
;
ts_dst
.
raw_ptr
=
args
.
preprocessed_filter
->
tensors
[
0
].
raw_ptr
;
ts_dst
.
layout
=
dst
;
auto
&&
transpose
=
args
.
opr
->
handle
()
->
create_operator
<
RelayoutForward
>
();
transpose
->
exec
(
ts_src
,
ts_dst
);
cutlass_wrapper
::
do_conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64
<
true
>
(
reinterpret_cast
<
int8_t
*>
(
args
.
src_tensor
->
raw_ptr
),
reinterpret_cast
<
int8_t
*>
(
filter_ptr
),
reinterpret_cast
<
int32_t
*>
(
bias_ptr
),
reinterpret_cast
<
int8_t
*>
(
z_ptr
),
reinterpret_cast
<
int8_t
*>
(
args
.
dst_tensor
->
raw_ptr
),
nullptr
,
kern_param
,
nonlinear_mode
,
alpha
,
beta
,
gamma
,
dst_scale
,
threadblock_shape
,
warp_shape
,
stream
);
}
#endif
...
...
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
0 → 100644
浏览文件 @
e661ae90
/**
* \file dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./algo.h"
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
#include "src/cuda/conv_bias/reduce_filter.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"
using
namespace
megdnn
;
using
namespace
cuda
;
using
namespace
convolution
;
#if CUDA_VERSION >= 10020
/**
 * \brief Serialize the algorithm parameters of this instance.
 *
 * \return a string that serialize_write_pod() has filled from m_algo_param.
 */
std::string ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::param()
        const {
    std::string serialized;
    serialize_write_pod(m_algo_param, serialized);
    return serialized;
}
/**
 * \brief Tell whether this algorithm can handle the given problem.
 *
 * The checks run in a fixed order and stop at the first failure:
 *   1. a bias tensor is present (bias ndim > 0);
 *   2. check_bias_share_in_channel() accepts the bias layout for the format;
 *   3. format is NCHW64, sparse is DENSE, mode is CROSS_CORRELATION;
 *   4. the nonlinearity is IDENTITY, RELU or H_SWISH;
 *   5. src and dst dtypes both equal src_dtype() (supplied by the subclass),
 *      filter is QuantizedS4 and bias is QuantizedS32;
 *   6. is_compute_capability_required(7, 5) holds.
 *
 * \param args sizes, layouts and operator of the problem to be checked
 * \return true iff every check above passes
 */
bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
        const SizeArgs& args) const {
    using Param = param::ConvBias;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    using NonlineMode = megdnn::param::ConvBias::NonlineMode;

    // A bias tensor is mandatory for this algorithm.
    if (args.bias_layout->ndim <= 0) {
        return false;
    }

    const auto& opr_param = args.opr->param();
    if (!check_bias_share_in_channel(*args.bias_layout, opr_param.format)) {
        return false;
    }

    const bool conv_kind_supported =
            opr_param.format == Format::NCHW64 &&
            opr_param.sparse == Sparse::DENSE &&
            opr_param.mode == Mode::CROSS_CORRELATION;
    if (!conv_kind_supported) {
        return false;
    }

    const bool nonlinearity_supported =
            opr_param.nonlineMode == NonlineMode::IDENTITY ||
            opr_param.nonlineMode == NonlineMode::RELU ||
            opr_param.nonlineMode == NonlineMode::H_SWISH;
    if (!nonlinearity_supported) {
        return false;
    }

    // src and dst share the 4-bit dtype chosen by the concrete subclass;
    // filter and bias dtypes are fixed.
    const bool dtypes_supported =
            args.src_layout->dtype.enumv() == src_dtype() &&
            args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4 &&
            args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 &&
            args.dst_layout->dtype.enumv() == src_dtype();
    if (!dtypes_supported) {
        return false;
    }

    // NOTE(review): presumably "device compute capability >= 7.5" -- confirm
    // against the helper's definition.
    if (!is_compute_capability_required(7, 5)) {
        return false;
    }
    return true;
}
/**
 * \brief Shared execution driver for the subclasses of this base.
 *
 * Gathers the problem sizes, asks the subclass hooks for the filter/bias
 * pointers (prepare_filter_bias) and the epilogue constants (get_constants),
 * fills a ConvParam and finally forwards everything to do_exec().
 *
 * \param args execution arguments (tensors, layouts, operator, workspace)
 */
void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;

    // Dimension 1 of src/dst is multiplied by 64 to obtain ci/co.
    auto& src_shape = *args.src_layout;
    auto& dst_shape = *args.dst_layout;
    size_t n = src_shape[0];
    size_t ci = src_shape[1] * 64;
    size_t hi = src_shape[2];
    size_t wi = src_shape[3];
    size_t co = dst_shape[1] * 64;
    size_t ho = dst_shape[2];
    size_t wo = dst_shape[3];
    // The two macros are defined outside this file; the kern_param fields
    // below rely on ph/pw/sh/sw/fh/fw being introduced here.
    UNPACK_CONV_PARAMETER(fm, param);
    MARK_USED_VAR

    // Subclass hook: returns the filter and bias pointers to pass to do_exec.
    void* filter_ptr = nullptr;
    void* bias_ptr = nullptr;
    std::tie(filter_ptr, bias_ptr) = prepare_filter_bias(args);

    // z is only forwarded when a z tensor is present (z ndim > 0).
    void* z_ptr = nullptr;
    if (args.z_layout->ndim > 0) {
        z_ptr = args.z_tensor->raw_ptr;
    }

    // Subclass hook: epilogue constants, in the order documented in algo.h.
    float alpha = 0.f;
    float beta = 0.f;
    float gamma = 0.f;
    float delta = 0.f;
    float theta = 0.f;
    std::tie(alpha, beta, gamma, delta, theta) = get_constants(args);

    ConvParam kern_param;
    kern_param.n = n;
    kern_param.co = co;
    kern_param.ci = ci;
    kern_param.hi = hi;
    kern_param.wi = wi;
    kern_param.ho = ho;
    kern_param.wo = wo;
    kern_param.ph = ph;
    kern_param.pw = pw;
    kern_param.sh = sh;
    kern_param.sw = sw;
    kern_param.fh = fh;
    kern_param.fw = fw;

    uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
    cudaStream_t stream = cuda_stream(args.opr->handle());

    do_exec(args, filter_ptr, bias_ptr, z_ptr, kern_param, nonlinear_mode,
            alpha, beta, gamma, delta, theta, stream);
}
/**
 * \brief Render the tile configuration as text.
 *
 * The result has the form "<tb_m>X<tb_n>X<tb_k>_<warp_m>X<warp_n>X<warp_k>".
 *
 * \param algo_param threadblock and warp tile sizes
 * \return the formatted string produced by ssprintf()
 */
std::string ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::to_string(
        AlgoParam algo_param) {
    // NOTE(review): warp_n/warp_k are declared `int` in algo.h while the
    // format uses %u; harmless for non-negative sizes, but worth confirming.
    const AlgoParam& tile = algo_param;
    return ssprintf(
            "%uX%uX%u_%uX%uX%u",
            tile.threadblock_m, tile.threadblock_n, tile.threadblock_k,
            tile.warp_m, tile.warp_n, tile.warp_k);
}
void
ConvBiasForwardImpl
::
AlgoInt4NCHW64IMMAImplicitGemmBase
::
reorder_filter
(
const
ExecArgs
&
args
,
void
*
reordered_filter
)
const
{
auto
&&
param
=
args
.
opr
->
param
();
auto
&&
fm
=
args
.
filter_meta
;
size_t
n
=
args
.
src_layout
->
operator
[](
0
),
ci
=
args
.
src_layout
->
operator
[](
1
)
*
64
,
hi
=
args
.
src_layout
->
operator
[](
2
),
wi
=
args
.
src_layout
->
operator
[](
3
);
size_t
co
=
args
.
dst_layout
->
operator
[](
1
)
*
64
,
ho
=
args
.
dst_layout
->
operator
[](
2
),
wo
=
args
.
dst_layout
->
operator
[](
3
);
UNPACK_CONV_PARAMETER
(
fm
,
param
);
MARK_USED_VAR
;
// filter: KCRS64 => CRSK64
TensorLayout
src
{{
co
,
ci
/
64
,
fh
,
fw
,
64
},
dtype
::
QuantizedS4
()};
src
.
init_contiguous_stride
();
TensorLayout
dst
=
src
;
dst
.
stride
[
0
]
=
64
;
dst
.
stride
[
1
]
=
co
*
fh
*
fw
*
64
;
dst
.
stride
[
2
]
=
co
*
fw
*
64
;
dst
.
stride
[
3
]
=
co
*
64
;
dst
.
stride
[
4
]
=
1
;
TensorND
ts_src
,
ts_dst
;
ts_src
.
raw_ptr
=
args
.
filter_tensor
->
raw_ptr
;
ts_src
.
layout
=
src
;
ts_dst
.
raw_ptr
=
reordered_filter
;
ts_dst
.
layout
=
dst
;
auto
&&
transpose
=
args
.
opr
->
handle
()
->
create_operator
<
RelayoutForward
>
();
transpose
->
exec
(
ts_src
,
ts_dst
);
}
#endif
// vim: syntax=cpp.doxygen
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nchw64_imma.cpp
浏览文件 @
e661ae90
...
...
@@ -11,10 +11,8 @@
*/
#include "./algo.h"
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"
#include "src/cuda/conv_bias/reduce_filter.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"
using
namespace
megdnn
;
...
...
@@ -22,85 +20,60 @@ using namespace cuda;
using
namespace
convolution
;
#if CUDA_VERSION >= 10020
bool
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
is_available
(
const
SizeArgs
&
args
)
const
{
if
(
args
.
bias_layout
->
ndim
<=
0
)
return
false
;
using
Param
=
param
::
ConvBias
;
using
Format
=
Param
::
Format
;
using
Sparse
=
Param
::
Sparse
;
using
Mode
=
Param
::
Mode
;
using
NonlineMode
=
megdnn
::
param
::
ConvBias
::
NonlineMode
;
auto
&&
param
=
args
.
opr
->
param
();
if
(
!
check_bias_share_in_channel
(
*
(
args
.
bias_layout
),
param
.
format
))
return
false
;
if
(
param
.
format
!=
Format
::
NCHW64
||
param
.
sparse
!=
Sparse
::
DENSE
||
param
.
mode
!=
Mode
::
CROSS_CORRELATION
)
return
false
;
if
(
param
.
nonlineMode
!=
NonlineMode
::
IDENTITY
&&
param
.
nonlineMode
!=
NonlineMode
::
RELU
&&
param
.
nonlineMode
!=
NonlineMode
::
H_SWISH
)
return
false
;
if
(
args
.
src_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
Quantized4Asymm
||
args
.
filter_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
QuantizedS4
||
args
.
bias_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
QuantizedS32
||
args
.
dst_layout
->
dtype
.
enumv
()
!=
DTypeEnum
::
Quantized4Asymm
)
return
false
;
if
(
!
is_compute_capability_required
(
7
,
5
))
return
false
;
return
true
;
}
WorkspaceBundle
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
get_workspace_bundle
(
dt_byte
*
raw_ptr
,
const
SizeArgs
&
args
)
const
{
size_t
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
get_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
if
(
args
.
preprocessed_filter
)
{
return
WorkspaceBundle
{
raw_ptr
,
{}}
;
return
0
;
}
else
{
size_t
ws_filter
=
args
.
filter_layout
->
span
().
dist_byte
(),
ws_bias
=
args
.
bias_layout
->
span
().
dist_byte
(),
ws_reduce_filter
=
get_preprocess_workspace_in_bytes
(
args
);
return
WorkspaceBundle
{
raw_ptr
,
{
ws_filter
+
ws_bias
+
ws_reduce_filter
}};
return
ws_filter
+
ws_bias
+
ws_reduce_filter
;
}
}
size_t
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
get_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
return
get_workspace_bundle
(
nullptr
,
args
).
total_size_in_bytes
();
get_preprocess_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
size_t
co
=
args
.
filter_layout
->
operator
[](
0
),
ci
=
args
.
filter_layout
->
operator
[](
1
)
*
64
,
fh
=
args
.
filter_layout
->
operator
[](
2
),
fw
=
args
.
filter_layout
->
operator
[](
3
);
size_t
ws_size_reduce_filter
=
co
*
sizeof
(
int32_t
);
size_t
A
=
co
,
B
=
ci
*
fh
*
fw
/
8
,
C
=
1
;
ws_size_reduce_filter
+=
do_dispatch_reduce_workspace_in_bytes
(
A
,
B
,
C
);
return
ws_size_reduce_filter
;
}
void
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
exec
(
SmallVector
<
TensorLayout
>
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
deduce_preprocessed_filter_layout
(
const
SizeArgs
&
args
)
const
{
return
{
args
.
filter_layout
->
collapse_contiguous
(),
args
.
bias_layout
->
collapse_contiguous
()};
}
void
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
exec_preprocess
(
const
ExecArgs
&
args
)
const
{
auto
&&
param
=
args
.
opr
->
param
();
auto
&&
fm
=
args
.
filter_meta
;
size_t
n
=
args
.
src_layout
->
operator
[](
0
),
ci
=
args
.
src_layout
->
operator
[](
1
)
*
64
,
hi
=
args
.
src_layout
->
operator
[](
2
),
wi
=
args
.
src_layout
->
operator
[](
3
);
size_t
co
=
args
.
dst_layout
->
operator
[](
1
)
*
64
,
ho
=
args
.
dst_layout
->
operator
[](
2
),
wo
=
args
.
dst_layout
->
operator
[](
3
);
UNPACK_CONV_PARAMETER
(
fm
,
param
);
MARK_USED_VAR
auto
&&
stream
=
cuda_stream
(
args
.
opr
->
handle
());
megdnn_assert
(
args
.
preprocessed_filter
->
tensors
.
size
()
==
2
);
void
*
filter_ptr
=
args
.
preprocessed_filter
->
tensors
[
0
].
raw_ptr
;
void
*
bias_ptr
=
args
.
preprocessed_filter
->
tensors
[
1
].
raw_ptr
;
void
*
reduce_filter_ptr
=
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
);
void
*
reduce_workspace
=
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
+
args
.
bias_layout
->
span
().
dist_byte
());
reorder_filter
(
args
,
filter_ptr
);
update_bias
(
args
,
bias_ptr
,
reduce_filter_ptr
,
reduce_workspace
);
}
std
::
tuple
<
void
*
,
void
*>
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
prepare_filter_bias
(
const
ExecArgs
&
args
)
const
{
void
*
filter_ptr
=
nullptr
;
void
*
bias_ptr
=
nullptr
;
if
(
args
.
preprocessed_filter
)
{
megdnn_assert
(
args
.
preprocessed_filter
->
tensors
.
size
()
==
2
);
filter_ptr
=
args
.
preprocessed_filter
->
tensors
[
0
].
raw_ptr
;
bias_ptr
=
args
.
preprocessed_filter
->
tensors
[
1
].
raw_ptr
;
return
{
filter_ptr
,
bias_ptr
};
}
else
{
// reorder filter and bias
filter_ptr
=
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
);
bias_ptr
=
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
+
...
...
@@ -109,16 +82,20 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec(
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
+
args
.
filter_layout
->
span
().
dist_byte
()
+
args
.
bias_layout
->
span
().
dist_byte
());
reorder_filter_bias
(
args
,
reduce_filter_ptr
,
filter_ptr
,
bias_ptr
);
void
*
reduce_workspace
=
reinterpret_cast
<
void
*>
(
args
.
workspace
.
raw_ptr
+
args
.
filter_layout
->
span
().
dist_byte
()
+
args
.
bias_layout
->
span
().
dist_byte
()
+
args
.
bias_layout
->
span
().
dist_byte
());
reorder_filter
(
args
,
filter_ptr
);
update_bias
(
args
,
bias_ptr
,
reduce_filter_ptr
,
reduce_workspace
);
}
return
{
filter_ptr
,
bias_ptr
};
}
ConvParam
kern_param
;
kern_param
.
n
=
n
,
kern_param
.
co
=
co
,
kern_param
.
ci
=
ci
,
kern_param
.
hi
=
hi
,
kern_param
.
wi
=
wi
,
kern_param
.
ho
=
ho
,
kern_param
.
wo
=
wo
,
kern_param
.
ph
=
ph
,
kern_param
.
pw
=
pw
,
kern_param
.
sh
=
sh
,
kern_param
.
sw
=
sw
,
kern_param
.
fh
=
fh
,
kern_param
.
fw
=
fw
;
std
::
tuple
<
float
,
float
,
float
,
float
,
float
>
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
get_constants
(
const
ExecArgs
&
args
)
const
{
float
src_scale
=
args
.
src_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
scale
,
filter_scale
=
...
...
@@ -128,125 +105,67 @@ void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec(
dst_scale
=
args
.
dst_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
scale
;
uint8_t
src_zero
=
args
.
src_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
()
.
zero_point
,
dst_zero
=
args
.
dst_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
()
.
zero_point
;
float
alpha
=
src_scale
*
filter_scale
/
dst_scale
;
float
beta
=
bias_scale
/
dst_scale
;
float
gamma
=
0.
f
;
float
delta
=
0.
f
;
float
theta
=
dst_zero
;
uint8_t
dst_zero
=
args
.
dst_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
zero_point
;
float
alpha
=
src_scale
*
filter_scale
/
dst_scale
,
beta
=
bias_scale
/
dst_scale
,
gamma
=
0.
f
,
delta
=
0.
f
,
theta
=
dst_zero
;
uint8_t
*
z_dev_ptr
=
nullptr
;
if
(
args
.
z_layout
->
ndim
>
0
)
{
z_dev_ptr
=
reinterpret_cast
<
uint8_t
*>
(
args
.
z_tensor
->
raw_ptr
);
float
z_scale
=
args
.
z_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
scale
;
gamma
=
z_scale
/
dst_scale
;
uint8_t
z_zero
=
args
.
z_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
zero_point
;
gamma
=
z_scale
/
dst_scale
;
delta
=
-
z_zero
*
gamma
;
}
uint32_t
nonlinear_mode
=
static_cast
<
uint32_t
>
(
param
.
nonlineMode
);
cutlass_wrapper
::
do_conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64
<
true
>
(
reinterpret_cast
<
uint8_t
*>
(
args
.
src_tensor
->
raw_ptr
),
reinterpret_cast
<
int8_t
*>
(
filter_ptr
),
reinterpret_cast
<
int32_t
*>
(
bias_ptr
),
z_dev_ptr
,
reinterpret_cast
<
uint8_t
*>
(
args
.
dst_tensor
->
raw_ptr
),
nullptr
,
kern_param
,
nonlinear_mode
,
alpha
,
beta
,
gamma
,
delta
,
theta
,
dst_scale
,
src_zero
,
cutlass_wrapper
::
GemmCoord
{
m_algo_param
.
threadblock_m
,
m_algo_param
.
threadblock_n
,
m_algo_param
.
threadblock_k
},
cutlass_wrapper
::
GemmCoord
{
m_algo_param
.
warp_m
,
m_algo_param
.
warp_n
,
m_algo_param
.
warp_k
},
stream
);
return
{
alpha
,
beta
,
gamma
,
delta
,
theta
};
}
std
::
string
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
to_string
(
AlgoParam
algo_param
)
{
return
ssprintf
(
"%uX%uX%u_%uX%uX%u"
,
algo_param
.
threadblock_m
,
algo_param
.
threadblock_n
,
algo_param
.
threadblock_k
,
algo_param
.
warp_m
,
algo_param
.
warp_n
,
algo_param
.
warp_k
);
void
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
do_exec
(
const
ExecArgs
&
args
,
void
*
filter_ptr
,
void
*
bias_ptr
,
void
*
z_ptr
,
ConvParam
kern_param
,
uint32_t
nonlinear_mode
,
float
alpha
,
float
beta
,
float
gamma
,
float
delta
,
float
theta
,
cudaStream_t
stream
)
const
{
float
dst_scale
=
args
.
dst_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
scale
;
uint8_t
src_zero
=
args
.
src_layout
->
dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
zero_point
;
cutlass_wrapper
::
GemmCoord
threadblock_shape
{
m_algo_param
.
threadblock_m
,
m_algo_param
.
threadblock_n
,
m_algo_param
.
threadblock_k
};
cutlass_wrapper
::
GemmCoord
warp_shape
{
m_algo_param
.
warp_m
,
m_algo_param
.
warp_n
,
m_algo_param
.
warp_k
};
cutlass_wrapper
::
do_conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64
<
true
>
(
reinterpret_cast
<
uint8_t
*>
(
args
.
src_tensor
->
raw_ptr
),
reinterpret_cast
<
int8_t
*>
(
filter_ptr
),
reinterpret_cast
<
int32_t
*>
(
bias_ptr
),
reinterpret_cast
<
uint8_t
*>
(
z_ptr
),
reinterpret_cast
<
uint8_t
*>
(
args
.
dst_tensor
->
raw_ptr
),
nullptr
,
kern_param
,
nonlinear_mode
,
alpha
,
beta
,
gamma
,
delta
,
theta
,
dst_scale
,
src_zero
,
threadblock_shape
,
warp_shape
,
stream
);
}
size_t
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
get_preprocess_workspace_in_bytes
(
const
SizeArgs
&
args
)
const
{
void
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
update_bias
(
const
ExecArgs
&
args
,
void
*
updated_bias
,
void
*
reduce_filter_ptr
,
void
*
reduce_workspace
)
const
{
size_t
co
=
args
.
filter_layout
->
operator
[](
0
),
ci
=
args
.
filter_layout
->
operator
[](
1
)
*
64
,
fh
=
args
.
filter_layout
->
operator
[](
2
),
fw
=
args
.
filter_layout
->
operator
[](
3
);
size_t
ws_size_reduce_filter
=
co
*
sizeof
(
int32_t
);
size_t
A
=
co
,
B
=
ci
*
fh
*
fw
/
8
,
C
=
1
;
ws_size_reduce_filter
+=
do_dispatch_reduce_workspace_in_bytes
(
A
,
B
,
C
);
return
ws_size_reduce_filter
;
}
/*!
 * \brief Report the layouts of the preprocessed tensors for this algorithm.
 *
 * \param args size arguments; only filter_layout and bias_layout are read
 * \return two layouts, in order: the contiguous-collapsed filter layout and
 *         the contiguous-collapsed bias layout
 */
SmallVector<TensorLayout> ConvBiasForwardImpl::
        AlgoUInt4Int4NCHW64IMMAImplicitGemm::deduce_preprocessed_filter_layout(
                const SizeArgs& args) const {
    SmallVector<TensorLayout> layouts;
    // Entry 0: filter, entry 1: bias.
    layouts.push_back(args.filter_layout->collapse_contiguous());
    layouts.push_back(args.bias_layout->collapse_contiguous());
    return layouts;
}
/*!
 * \brief Fill the preprocessed filter/bias tensors ahead of execution.
 *
 * Requires exactly two preprocessed tensors and delegates to
 * reorder_filter_bias, passing the workspace pointer as the reduce-filter
 * buffer, tensor 0 as the reordered-filter output and tensor 1 as the
 * reordered-bias output.
 *
 * \param args execution arguments; preprocessed_filter and workspace are read
 */
void ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm::exec_preprocess(
        const ExecArgs& args) const {
    megdnn_assert(args.preprocessed_filter->tensors.size() == 2);

    auto& outputs = args.preprocessed_filter->tensors;
    auto* reduce_buffer = args.workspace.raw_ptr;
    auto* filter_out = outputs[0].raw_ptr;
    auto* bias_out = outputs[1].raw_ptr;

    reorder_filter_bias(args, reduce_buffer, filter_out, bias_out);
}
void
ConvBiasForwardImpl
::
AlgoUInt4Int4NCHW64IMMAImplicitGemm
::
reorder_filter_bias
(
const
ExecArgs
&
args
,
void
*
reduce_filter
,
void
*
reordered_filter
,
void
*
reordered_bias
)
const
{
auto
&&
param
=
args
.
opr
->
param
();
auto
&&
fm
=
args
.
filter_meta
;
size_t
n
=
args
.
src_layout
->
operator
[](
0
),
ci
=
args
.
src_layout
->
operator
[](
1
)
*
64
,
hi
=
args
.
src_layout
->
operator
[](
2
),
wi
=
args
.
src_layout
->
operator
[](
3
);
size_t
co
=
args
.
dst_layout
->
operator
[](
1
)
*
64
,
ho
=
args
.
dst_layout
->
operator
[](
2
),
wo
=
args
.
dst_layout
->
operator
[](
3
);
UNPACK_CONV_PARAMETER
(
fm
,
param
);
MARK_USED_VAR
;
auto
&&
stream
=
cuda_stream
(
args
.
opr
->
handle
());
// filter: KCRS64 => CRSK64
TensorLayout
src
{{
co
,
ci
/
64
,
fh
,
fw
,
64
},
dtype
::
QuantizedS4
()};
src
.
init_contiguous_stride
();
TensorLayout
dst
=
src
;
dst
.
stride
[
0
]
=
64
;
dst
.
stride
[
1
]
=
co
*
fh
*
fw
*
64
;
dst
.
stride
[
2
]
=
co
*
fw
*
64
;
dst
.
stride
[
3
]
=
co
*
64
;
dst
.
stride
[
4
]
=
1
;
TensorND
ts_src
,
ts_dst
;
ts_src
.
raw_ptr
=
args
.
filter_tensor
->
raw_ptr
;
ts_src
.
layout
=
src
;
ts_dst
.
raw_ptr
=
reordered_filter
;
ts_dst
.
layout
=
dst
;
auto
&&
transpose
=
args
.
opr
->
handle
()
->
create_operator
<
RelayoutForward
>
();
transpose
->
exec
(
ts_src
,
ts_dst
);
// reduce filter and update bias
int32_t
*
workspace
=
reinterpret_cast
<
int32_t
*>
(
reordered_bias
)
+
args
.
bias_layout
->
span
().
dist_byte
();
int
src_zero_point
=
args
.
src_tensor
->
layout
.
dtype
.
param
<
dtype
::
Quantized4Asymm
>
()
.
zero_point
;
do_dispatch_reduce_filter_and_update_bias_4bit
<
true
>
(
reinterpret_cast
<
uint8_t
*>
(
args
.
filter_tensor
->
raw_ptr
),
args
.
bias_tensor
->
compatible_ptr
<
int32_t
>
(),
co
,
ci
*
fh
*
fw
/
8
,
reinterpret_cast
<
int32_t
*>
(
reordered_bias
),
workspace
,
src_zero_point
,
stream
);
reinterpret_cast
<
int32_t
*>
(
updated_bias
),
reinterpret_cast
<
int32_t
*>
(
reduce_workspace
),
src_zero_point
,
stream
);
}
#endif
...
...
dnn/src/cuda/conv_bias/opr_impl.h
浏览文件 @
e661ae90
...
...
@@ -64,6 +64,7 @@ public:
class
AlgoInt8CHWN4IMMAImplicitGemmReorderFilter
;
class
AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth
;
class
AlgoInt8NCHW32IMMAImplicitGemm
;
class
AlgoInt4NCHW64IMMAImplicitGemmBase
;
class
AlgoInt4Int4NCHW64IMMAImplicitGemm
;
class
AlgoUInt4Int4NCHW64IMMAImplicitGemm
;
class
AlgoBFloat16
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录