Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
19a554d6
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
19a554d6
编写于
3月 18, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
test(dnn/cuda): add testcase for transforming tensor layout between nchw and nchw64
GitOrigin-RevId: 75d579635ad177d9391b8da6ca45fab1086d3f6a
上级
71c2f612
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
201 addition
and
32 deletion
+201
-32
dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp
dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp
+2
-2
dnn/src/cuda/relayout_format/opr_impl.cpp
dnn/src/cuda/relayout_format/opr_impl.cpp
+4
-1
dnn/src/cuda/relayout_format/relayout_format.cpp
dnn/src/cuda/relayout_format/relayout_format.cpp
+3
-2
dnn/src/cuda/relayout_format/relayout_format.cu
dnn/src/cuda/relayout_format/relayout_format.cu
+52
-25
dnn/src/cuda/utils.cuh
dnn/src/cuda/utils.cuh
+6
-2
dnn/src/naive/relayout_format/opr_impl.cpp
dnn/src/naive/relayout_format/opr_impl.cpp
+51
-0
dnn/test/cuda/relayout_format.cpp
dnn/test/cuda/relayout_format.cpp
+83
-0
未找到文件。
dnn/src/cuda/conv_bias/fallback_nchw_qs4.cpp
浏览文件 @
19a554d6
...
...
@@ -161,7 +161,7 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS4::get_workspace_bundle(
ws_size_underlying_algo
,
ws_size_z
}};
}
return
WorkspaceBundle
{
raw_ptr
,
{
ws_size_src
,
ws_size_filter
,
ws_size_underlying_algo
,
ws_size_dst
}};
{
ws_size_src
,
ws_size_filter
,
ws_size_dst
,
ws_size_underlying_algo
}};
}
// vim: syntax=cpp.doxygen
dnn/src/cuda/relayout_format/opr_impl.cpp
浏览文件 @
19a554d6
...
...
@@ -30,7 +30,10 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
param
().
mode
==
param
::
RelayoutFormat
::
Mode
::
CHWN4_NCHW4
||
param
().
mode
==
Param
::
Mode
::
NCHW_NCHW4_IC_SMALL
||
param
().
mode
==
Param
::
Mode
::
NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT
,
Param
::
Mode
::
NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT
||
param
().
mode
==
Param
::
Mode
::
NCHW_NCHW64
||
param
().
mode
==
Param
::
Mode
::
NCHW64_NCHW
,
"relayout format of cuda only support NCHW4->CHWN4 or "
"CHWN4->NCHW4 or NCHW->NCHW4"
);
if
((
param
().
mode
==
param
::
RelayoutFormat
::
Mode
::
NCHW4_CHWN4
||
...
...
dnn/src/cuda/relayout_format/relayout_format.cpp
浏览文件 @
19a554d6
...
...
@@ -26,6 +26,9 @@ inline void get_scale_zeropoint(const DType& tensor_dtype, float& scale,
scale
=
tensor_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
}
else
if
(
tensor_dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS4
)
{
scale
=
tensor_dtype
.
param
<
dtype
::
QuantizedS4
>
().
scale
;
}
else
if
(
tensor_dtype
.
enumv
()
==
DTypeEnum
::
Quantized4Asymm
)
{
zero_point
=
tensor_dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
zero_point
;
scale
=
tensor_dtype
.
param
<
dtype
::
Quantized4Asymm
>
().
scale
;
}
}
...
...
@@ -41,8 +44,6 @@ void relayout_format::RelayoutFormatFast::exec(const TensorND& src,
cudaStream_t
stream
,
RelayoutFormat
::
Param
::
Mode
mode
,
int
group
)
{
auto
&&
stype
=
src
.
layout
.
dtype
;
auto
&&
dtype
=
dst
.
layout
.
dtype
;
float
src_scale
=
1.
f
;
float
dst_scale
=
1.
f
;
uint8_t
src_zero_point
=
0
;
...
...
dnn/src/cuda/relayout_format/relayout_format.cu
浏览文件 @
19a554d6
...
...
@@ -538,9 +538,9 @@ struct Translayout<64, 8, SrcType, dtype::QuantizedS4, dtype::QuantizedS4,
};
#undef pack
#define pack(_idx)
\
((
uint8_t)(post_process(intermediate[0][_idx])
) | \
(
(uint8_t)(post_process(intermediate[1][_idx])
) << 4))
#define pack(_idx) \
((
post_process(intermediate[0][_idx]) & 0xf
) | \
(
post_process(intermediate[1][_idx]
) << 4))
template
<
typename
SrcType
,
bool
same_scale
>
struct
Translayout
<
64
,
2
,
SrcType
,
dtype
::
QuantizedS4
,
dtype
::
QuantizedS4
,
same_scale
>
{
...
...
@@ -648,9 +648,9 @@ struct Translayout<64, 8, SrcType, dtype::Quantized4Asymm,
};
#undef pack
#define pack(_idx)
\
(
(uint8_t)(post_process(intermediate[0][_idx])
) | \
(
(uint8_t)(post_process(intermediate[1][_idx])
) << 4))
#define pack(_idx) \
(
post_process(intermediate[0][_idx]
) | \
(
post_process(intermediate[1][_idx]
) << 4))
template
<
typename
SrcType
,
bool
same_scale
>
struct
Translayout
<
64
,
2
,
SrcType
,
dtype
::
Quantized4Asymm
,
dtype
::
Quantized4Asymm
,
same_scale
>
{
...
...
@@ -820,13 +820,25 @@ __global__ void kern_nchw_nchwx(
int
n_stride_src
,
int
ic_stride
,
int
n_stride_dst
,
int
oc_stride
,
CudaPostProcess
<
DnnSrcType
,
DnnDstType
,
same_scale
>
post_process
,
const
char
zero_point
,
const
int
group
,
const
int
ocpg
)
{
static
constexpr
int
size_src_type
=
sizeof
(
SrcType
);
static
constexpr
int
size_dst_type
=
sizeof
(
DstType
);
#ifndef MEGDNN_COMMA
#define MEGDNN_COMMA ,
#endif
MEGDNN_STATIC_ASSERT
(
std
::
is_same
<
SrcType
MEGDNN_COMMA
DstType
>::
value
,
"Currently this kernel only support accessing tensor "
"src and dst in same data type."
);
n_stride_src
/=
size_src_type
;
ic_stride
/=
size_src_type
;
n_stride_dst
/=
size_dst_type
;
oc_stride
/=
size_dst_type
;
const
int
n_idx
=
blockIdx
.
y
;
const
int
ihw_block_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
ihw_offset
=
ihw_block_idx
*
pack_w
;
const
int
ihw_offset_in_type
=
ihw_offset
*
size_nbits
/
(
8
*
sizeof
(
SrcType
));
ihw_offset
*
size_nbits
/
(
8
*
size_src_type
);
if
(
ihw_offset
<
ihw
)
{
const
int
src_offset_base
=
n_idx
*
n_stride_src
+
ihw_offset_in_type
;
const
int
dst_offset_base
=
...
...
@@ -836,7 +848,7 @@ __global__ void kern_nchw_nchwx(
const
int
ic_block
=
icpg
/
pack_c
;
const
int
remain_ic
=
icpg
%
pack_c
;
const
int
src_group_stride
=
icpg
*
ic_stride
;
const
int
dst_group_stride
=
ocpg
*
oc_stride
;
const
int
dst_group_stride
=
(
ocpg
/
pack_c
)
*
oc_stride
;
for
(
int
g_idx
=
0
;
g_idx
<
group
;
++
g_idx
)
{
const
int
src_offset
=
src_offset_base
+
g_idx
*
src_group_stride
;
...
...
@@ -1018,7 +1030,7 @@ public:
int
chan_stride_in_elements_
,
int
channel_
)
:
pointer
{
pointer_
},
chan_stride_in_elements
{
chan_stride_in_elements
},
chan_stride_in_elements
{
chan_stride_in_elements
_
},
channel
{
channel_
}
{}
MEGDNN_DEVICE
__forceinline__
void
load
(
Fragment
&
frag
)
{
...
...
@@ -1031,7 +1043,7 @@ public:
int
frag_idx
=
i
/
pack_size
*
(
lane_size_in_type
/
pack_size_in_type
)
+
j
;
bool
guard
=
i
>=
channel
;
bool
guard
=
i
<
channel
;
cutlass
::
arch
::
global_load
<
AccessType
,
pack_size_in_byte
>
(
frag_ptr
[
frag_idx
],
reinterpret_cast
<
void
*>
(
pointer_
+
...
...
@@ -1052,7 +1064,7 @@ public:
int
frag_idx
=
i
/
pack_size
*
(
lane_size_in_type
/
pack_size_in_type
)
+
j
;
bool
guard
=
i
>=
channel
;
bool
guard
=
i
<
channel
;
cutlass
::
arch
::
global_store
<
AccessType
,
pack_size_in_byte
>
(
frag_ptr
[
frag_idx
],
reinterpret_cast
<
void
*>
(
pointer_
+
...
...
@@ -1092,11 +1104,24 @@ __global__ void kern_nchwx_nchw(
size_nbits
>
;
using
Transpose
=
Translayout
<
pack_c
,
pack_w
,
SrcType
,
DnnSrcType
,
DnnDstType
,
same_scale
>
;
static
constexpr
int
size_src_type
=
sizeof
(
SrcType
);
static
constexpr
int
size_dst_type
=
sizeof
(
DstType
);
MEGDNN_STATIC_ASSERT
(
std
::
is_same
<
SrcType
MEGDNN_COMMA
DstType
>::
value
,
"Currently this kernel only support accessing tensor "
"src and dst in same data type."
);
n_stride_src
/=
size_src_type
;
ic_stride
/=
size_src_type
;
n_stride_dst
/=
size_dst_type
;
oc_stride
/=
size_dst_type
;
#undef MEGDNN_COMMA
const
int
n_idx
=
blockIdx
.
y
;
const
int
ihw_block_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
ihw_offset
=
ihw_block_idx
*
pack_w
;
const
int
ihw_offset_in_type
=
ihw_offset
*
size_nbits
/
(
8
*
sizeof
(
SrcType
));
ihw_offset
*
size_nbits
/
(
8
*
size_src_type
);
const
int
oc_stride_inner_dtype
=
oc_stride
*
size_dst_type
/
sizeof
(
InnerDtype
);
if
(
ihw_offset
<
ihw
)
{
const
int
ic_block
=
(
ic
+
pack_c
-
1
)
/
pack_c
;
const
int
src_offset_base
=
...
...
@@ -1105,8 +1130,8 @@ __global__ void kern_nchwx_nchw(
SrcIterator
src_iterator
{
const_cast
<
SrcType
*>
(
src
+
src_offset_base
),
ic_stride
,
ic
};
DstIteraotr
dst_iterator
{
reinterpret_cast
<
InnerDtype
*>
(
dst
+
dst_offset_base
),
oc_stride
,
ic
};
reinterpret_cast
<
InnerDtype
*>
(
dst
+
dst_offset_base
),
oc_stride_inner_dtype
,
ic
};
for
(
int
ic_blk_idx
=
0
;
ic_blk_idx
<
ic_block
;
++
ic_blk_idx
)
{
typename
SrcIterator
::
Fragment
src_frag
;
...
...
@@ -1143,12 +1168,13 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
DEF
(
64
,
Quantized4Asymm
,
Quantized4Asymm
)
DEF
(
4
,
QuantizedS8
,
QuantizedS8
)
DEF
(
4
,
Uint8
,
QuantizedS8
)
DEF
(
4
,
Quantized8Asymm
,
Quantized
8Asymm
)
DEF
(
4
,
QuantizedS32
,
QuantizedS32
)
;
DEF
(
4
,
Quantized8Asymm
,
Quantized
S8
)
DEF
(
4
,
QuantizedS32
,
QuantizedS32
)
// clang-format on
megdnn_assert
(
pack_oc
==
4
||
pack_oc
==
64
,
"Unsupport pack size(pack_oc:%d)"
,
pack_oc
);
#undef DEF
"Unsupport pack size(pack_oc:%d, src:%s, dst:%s)"
,
pack_oc
,
stype
.
name
(),
dtype
.
name
());
#undef DEF
const
int
in_n
=
src
.
layout
[
0
];
const
int
out_n
=
dst
.
layout
[
0
];
const
int
ic
=
src
.
layout
[
1
];
...
...
@@ -1157,6 +1183,7 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
const
int
oc
=
dst
.
layout
[
1
]
*
pack_oc
;
const
int
hw
=
h
*
w
;
const
int
ocpg
=
oc
/
group
;
// stride in byte
const
int
n_stride_src
=
src_layout
.
dtype
.
size
(
src_layout
.
stride
[
0
]);
const
int
ic_stride
=
src_layout
.
dtype
.
size
(
src_layout
.
stride
[
1
]);
const
int
n_stride_dst
=
dst_layout
.
dtype
.
size
(
dst_layout
.
stride
[
0
]);
...
...
@@ -1244,20 +1271,20 @@ void relayout_format::relayout_format_cuda_nchwx_nchw(
auto
&
src_layout
=
src
.
layout
;
auto
&
dst_layout
=
dst
.
layout
;
// check pack size
int
pack_
o
c
=
std
::
numeric_limits
<
int
>::
min
();
#define DEF(_pack_
o
c, _src_type, _dst_type) \
int
pack_
i
c
=
std
::
numeric_limits
<
int
>::
min
();
#define DEF(_pack_
i
c, _src_type, _dst_type) \
if (stype.enumv().ev == DTypeEnum::Ev::_src_type && \
dtype.enumv().ev == DTypeEnum::Ev::_dst_type) { \
pack_
oc = _pack_o
c; \
pack_
ic = _pack_i
c; \
}
// clang-format off
DEF
(
64
,
QuantizedS4
,
QuantizedS4
)
DEF
(
64
,
Quantized4Asymm
,
Quantized4Asymm
)
// clang-format on
megdnn_assert
(
pack_
oc
==
64
,
"Unsupport pack size(pack_oc:%d)"
,
pack_o
c
);
megdnn_assert
(
pack_
ic
==
64
,
"Unsupport pack size(pack_ic:%d)"
,
pack_i
c
);
#undef DEF
const
int
n
=
src
.
layout
[
0
];
const
int
c
=
src
.
layout
[
1
];
const
int
c
=
src
.
layout
[
1
]
*
pack_ic
;
const
int
h
=
src
.
layout
[
2
];
// align to byte
const
int
w
=
src
.
layout
[
3
];
...
...
@@ -1266,7 +1293,7 @@ void relayout_format::relayout_format_cuda_nchwx_nchw(
const
int
ic_stride
=
src_layout
.
dtype
.
size
(
src_layout
.
stride
[
1
]);
const
int
n_stride_dst
=
dst_layout
.
dtype
.
size
(
dst_layout
.
stride
[
0
]);
const
int
oc_stride
=
dst_layout
.
dtype
.
size
(
dst_layout
.
stride
[
1
]);
bool
same_scale
=
src_scale
==
dst_scale
;
#define DISPATCH_RAW(_same_scale, _pack_w, _pack_oc, _src_type, _dst_type, \
_src_c_type, _dst_c_type, _size_nbits) \
...
...
dnn/src/cuda/utils.cuh
浏览文件 @
19a554d6
...
...
@@ -378,7 +378,9 @@ MEGDNN_DEVICE __forceinline__ static float4 operator+(float4 lval,
MEGDNN_DEVICE
__forceinline__
static
int
transform_int8_to_int4x8
(
int
s0
,
int
s1
,
int
s2
,
int
s3
,
int
s4
,
int
s5
,
int
s6
,
int
s7
)
{
unsigned
out
;
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ >= 750 && \
((__CUDACC_VER_MAJOR__ > 10) || \
((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
asm
volatile
(
"{ .reg .u32 r4;"
"cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;"
...
...
@@ -411,7 +413,9 @@ MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8(
MEGDNN_DEVICE
__forceinline__
static
int
transform_int8_to_uint4x8
(
int
s0
,
int
s1
,
int
s2
,
int
s3
,
int
s4
,
int
s5
,
int
s6
,
int
s7
)
{
unsigned
out
;
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ >= 750 && \
((__CUDACC_VER_MAJOR__ > 10) || \
((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
asm
volatile
(
"{ .reg .u32 r4;"
"cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;"
...
...
dnn/src/naive/relayout_format/opr_impl.cpp
浏览文件 @
19a554d6
...
...
@@ -226,6 +226,7 @@ void do_copy_diff_q8_q8(const TensorND& dst, const TensorND& src) {
++
isrc
;
}
}
void
do_copy_diff_q32_q32
(
const
TensorND
&
dst
,
const
TensorND
&
src
)
{
auto
isrc
=
tensor_iter_valonly
<
DTypeTrait
<
dtype
::
QuantizedS32
>::
ctype
>
(
src
)
.
begin
();
...
...
@@ -253,6 +254,38 @@ void do_copy_diff_u8_q8(const TensorND& dst, const TensorND& src) {
}
}
void
do_copy_diff_q4_q4
(
const
TensorND
&
dst
,
const
TensorND
&
src
)
{
auto
isrc
=
tensor_iter_valonly
<
DTypeTrait
<
dtype
::
QuantizedS4
>::
ctype
>
(
src
)
.
begin
();
auto
idst
=
tensor_iter_valonly
<
DTypeTrait
<
dtype
::
QuantizedS4
>::
ctype
>
(
dst
)
.
begin
();
auto
src_dt_parm
=
src
.
layout
.
dtype
.
param
<
dtype
::
QuantizedS4
>
();
auto
dst_dt_parm
=
dst
.
layout
.
dtype
.
param
<
dtype
::
QuantizedS4
>
();
for
(
size_t
i
=
0
,
it
=
dst
.
layout
.
total_nr_elems
();
i
<
it
;
++
i
)
{
*
idst
=
dst_dt_parm
.
quantize
(
src_dt_parm
.
dequantize
(
int8_t
(
*
isrc
)));
++
idst
;
++
isrc
;
}
}
void
do_copy_diff_qu4_qu4
(
const
TensorND
&
dst
,
const
TensorND
&
src
)
{
auto
isrc
=
tensor_iter_valonly
<
DTypeTrait
<
dtype
::
Quantized4Asymm
>::
ctype
>
(
src
)
.
begin
();
auto
idst
=
tensor_iter_valonly
<
DTypeTrait
<
dtype
::
Quantized4Asymm
>::
ctype
>
(
dst
)
.
begin
();
auto
src_dt_parm
=
src
.
layout
.
dtype
.
param
<
dtype
::
Quantized4Asymm
>
();
auto
dst_dt_parm
=
dst
.
layout
.
dtype
.
param
<
dtype
::
Quantized4Asymm
>
();
for
(
size_t
i
=
0
,
it
=
dst
.
layout
.
total_nr_elems
();
i
<
it
;
++
i
)
{
*
idst
=
dst_dt_parm
.
quantize
(
src_dt_parm
.
dequantize
(
uint8_t
(
*
isrc
)));
++
idst
;
++
isrc
;
}
}
void
check_layout_and_canonize
(
TensorLayout
&
src
,
TensorLayout
&
dst
)
{
megdnn_assert
(
dst
.
is_non_overlapping_strong
());
src
=
src
.
collapse_contiguous
();
...
...
@@ -595,6 +628,24 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
};
MEGDNN_DISPATCH_CPU_KERN_OPR
(
func
(
dst0
,
src0
));
return
;
}
else
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS4
&&
dst
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS4
)
{
TensorND
src0
=
exec_src_nd
,
dst0
=
exec_dst_nd
;
check_layout_and_canonize
(
src0
.
layout
,
src0
.
layout
);
auto
func
=
[](
const
TensorND
&
dst
,
const
TensorND
&
src
)
{
do_copy_diff_q4_q4
(
dst
,
src
);
};
MEGDNN_DISPATCH_CPU_KERN_OPR
(
func
(
dst0
,
src0
));
return
;
}
else
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
Quantized4Asymm
&&
dst
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
Quantized4Asymm
)
{
TensorND
src0
=
exec_src_nd
,
dst0
=
exec_dst_nd
;
check_layout_and_canonize
(
src0
.
layout
,
src0
.
layout
);
auto
func
=
[](
const
TensorND
&
dst
,
const
TensorND
&
src
)
{
do_copy_diff_qu4_qu4
(
dst
,
src
);
};
MEGDNN_DISPATCH_CPU_KERN_OPR
(
func
(
dst0
,
src0
));
return
;
}
else
{
m_handle
->
relayout_opr
()
->
exec
(
exec_src_nd
,
exec_dst_nd
,
handle
());
}
...
...
dnn/test/cuda/relayout_format.cpp
浏览文件 @
19a554d6
...
...
@@ -237,6 +237,89 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_IC_SMALL) {
.
execs
({{
8
,
3
,
768
,
1280
},
{}});
}
//! Runs RelayoutFormat in NCHW_NCHW64 mode on the CUDA handle through
//! Checker, for QuantizedS4 and Quantized4Asymm inputs, with both matching
//! and differing quantization parameters between src and dst.
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng_s4{-8, 7};
    UniformIntRNG rng_u4{0, 15};
    param::RelayoutFormat relayout_param;
    relayout_param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
    for (size_t batch : {1, 3}) {
        for (size_t channels : {64, 128}) {
            for (size_t height : {7, 14, 16, 28}) {
                for (size_t width : {2, 4, 14, 16}) {
                    // QuantizedS4, identical scale on both sides
                    checker.set_dtype(0, dtype::QuantizedS4{2.f})
                            .set_dtype(1, dtype::QuantizedS4{2.f})
                            .set_rng(0, &rng_s4)
                            .set_param(relayout_param)
                            .execs({{batch, channels, height, width}, {}});

                    // Quantized4Asymm, same scale, different zero point
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 8})
                            .set_dtype(1, dtype::Quantized4Asymm{1.2f, 4})
                            .set_rng(0, &rng_u4)
                            .set_param(relayout_param)
                            .execs({{batch, channels, height, width}, {}});

                    // QuantizedS4, different scales
                    checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
                            .set_dtype(1, dtype::QuantizedS4{1.f})
                            .set_rng(0, &rng_s4)
                            .set_param(relayout_param)
                            .execs({{batch, channels, height, width}, {}});

                    // Quantized4Asymm, different scale and zero point; the
                    // epsilon is only set explicitly for this case
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.19990307f, 8})
                            .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
                            .set_rng(0, &rng_u4)
                            .set_param(relayout_param)
                            .set_epsilon(1e-3)
                            .execs({{batch, channels, height, width}, {}});
                }
            }
        }
    }
}
//! Runs RelayoutFormat in NCHW64_NCHW mode on the CUDA handle through
//! Checker, for QuantizedS4 and Quantized4Asymm inputs, with both matching
//! and differing quantization parameters between src and dst. The source
//! shape passed to the checker is {n, c / 64, h, w, 64}.
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng_s4{-8, 7};
    UniformIntRNG rng_u4{0, 15};
    param::RelayoutFormat relayout_param;
    relayout_param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
    for (size_t batch : {1, 3}) {
        for (size_t channels : {64, 128}) {
            for (size_t height : {7, 14, 16, 28}) {
                for (size_t width : {2, 4, 14, 16}) {
                    // QuantizedS4, identical scale on both sides
                    checker.set_dtype(0, dtype::QuantizedS4{2.f})
                            .set_dtype(1, dtype::QuantizedS4{2.f})
                            .set_rng(0, &rng_s4)
                            .set_param(relayout_param)
                            .set_epsilon(1e-3)
                            .execs({{batch, channels / 64, height, width, 64},
                                    {}});

                    // Quantized4Asymm, same scale, different zero point
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 4})
                            .set_dtype(1, dtype::Quantized4Asymm{1.2f, 8})
                            .set_rng(0, &rng_u4)
                            .set_param(relayout_param)
                            .set_epsilon(1e-3)
                            .execs({{batch, channels / 64, height, width, 64},
                                    {}});

                    // QuantizedS4, different scales
                    checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
                            .set_dtype(1, dtype::QuantizedS4{1.f})
                            .set_rng(0, &rng_s4)
                            .set_param(relayout_param)
                            .set_epsilon(1e-3)
                            .execs({{batch, channels / 64, height, width, 64},
                                    {}});

                    // Quantized4Asymm, different scale and zero point
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.20211209f, 8})
                            .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
                            .set_rng(0, &rng_u4)
                            .set_param(relayout_param)
                            .set_epsilon(1e-3)
                            .execs({{batch, channels / 64, height, width, 64},
                                    {}});
                }
            }
        }
    }
}
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
BENCHMARK_RELAYOUT_FORMAT
)
{
using
Param
=
RelayoutFormat
::
Param
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录