Commit 8fef78d0
Authored on Mar 22, 2021 by Megvii Engine Team

feat(dnn/cuda): add relayout format when width is an odd number

GitOrigin-RevId: f059f1f56dd66c33633118c893027ddd50ac8f1d

Parent: 91d61607
Showing 6 changed files with 512 additions and 186 deletions:

  dnn/src/common/relayout_format.cpp                 +1    -1
  dnn/src/cuda/relayout_format/relayout_format.cu    +435  -114
  dnn/test/common/benchmarker.h                      +5    -4
  dnn/test/common/checker.cpp                        +10   -64
  dnn/test/common/utils.h                            +17   -0
  dnn/test/cuda/relayout_format.cpp                  +44   -3
dnn/src/common/relayout_format.cpp

@@ -380,7 +380,7 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
             break;
     }
-    if (!dst.is_default() &&
+    if (dst.type() == TensorFormat::Type::IMAGE2D_PACK4 &&
         (handle()->type() != Handle::HandleType::NAIVE)) {
 #if MEGDNN_ENABLE_MANGLING
dnn/src/cuda/relayout_format/relayout_format.cu

(This diff is collapsed on the page: +435 additions, -114 deletions.)
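The bulk of the commit lives in this collapsed .cu file. For orientation, a minimal host-side sketch of what an NCHW -> NCHW64 relayout computes; everything below is illustrative, not MegEngine code, and it uses one int8_t per 4-bit value for clarity (the real kernels pack two int4 values per byte). The dst layout {N, C/64, H, W, 64} matches the test shapes further down. Note the scalar inner loop over w: nothing here requires W to be even.

#include <cstddef>
#include <cstdint>

// Hypothetical reference repack: NCHW -> NCHW64 with zero-padded channels.
void relayout_nchw_to_nchw64(const int8_t* src, int8_t* dst, size_t N,
                             size_t C, size_t H, size_t W) {
    const size_t CB = (C + 63) / 64;  // channel blocks, zero-padded to 64
    for (size_t n = 0; n < N; ++n)
        for (size_t cb = 0; cb < CB; ++cb)
            for (size_t h = 0; h < H; ++h)
                for (size_t w = 0; w < W; ++w)       // works for odd W too
                    for (size_t c64 = 0; c64 < 64; ++c64) {
                        const size_t c = cb * 64 + c64;
                        dst[(((n * CB + cb) * H + h) * W + w) * 64 + c64] =
                                c < C ? src[((n * C + c) * H + h) * W + w]
                                      : int8_t(0);
                    }
}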
dnn/test/common/benchmarker.h

@@ -87,10 +87,11 @@ public:
         for (size_t i = 0; i < shapes.size(); ++i) {
             DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
                                                          : dtype::Float32());
-            TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
-                                        ? m_fmt[i]
-                                        : DefaultTensorFormat::make());
-            layouts[i] = TensorLayout(shapes[i], dt, fmt);
+            if (m_fmt.find(i) == m_fmt.end()) {
+                layouts[i] = TensorLayout(shapes[i], dt);
+                layouts[i].init_contiguous_stride();
+            } else
+                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
         }
         return layouts;
     }
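One plausible reading of this change: when the caller supplies no TensorFormat, the layout now gets plain row-major contiguous strides via init_contiguous_stride() instead of going through a DefaultTensorFormat object. For reference, a self-contained sketch of what contiguous strides look like (illustrative names, not megdnn's API):

#include <cstddef>
#include <vector>

// Row-major contiguous strides: the innermost dimension has stride 1.
std::vector<size_t> contiguous_strides(const std::vector<size_t>& shape) {
    std::vector<size_t> strides(shape.size());
    size_t acc = 1;
    for (size_t i = shape.size(); i-- > 0;) {
        strides[i] = acc;
        acc *= shape[i];
    }
    return strides;
}
// contiguous_strides({2, 3, 4}) == {12, 4, 1}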
dnn/test/common/checker.cpp

@@ -19,7 +19,6 @@ using namespace megdnn;
 using namespace test;

 namespace {
     template <typename ctype, class Iter>
     ::testing::AssertionResult assert_tensor_eq_with_iter(
             const char* expr0, const char* expr1,
@@ -30,7 +29,7 @@ namespace {
         double error_sum = 0;
         double error_sum_biased = 0;
         for (size_t i = 0; i < nr_elem; ++i) {
-            ctype iv0 = *it0, iv1 = *it1;
+            ctype iv0 = ctype(*it0), iv1 = ctype(*it1);
             float err = diff(iv0, iv1);
             error_sum += std::abs(err);
             error_sum_biased += err;
@@ -84,12 +83,14 @@ namespace {
             const char* expr0, const char* expr1,
             const TensorND& v0, const TensorND& v1,
             float maxerr, float maxerr_avg, float maxerr_avg_biased) {
-        if (v0.layout.is_physical_contiguous() &&
-            v1.layout.is_physical_contiguous()) {
-            return assert_tensor_eq_with_iter<ctype>(
-                    expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(), v0.layout,
-                    maxerr, maxerr_avg, maxerr_avg_biased);
+        if (!std::is_same<ctype, dt_qint4>::value &&
+            !std::is_same<ctype, dt_quint4>::value) {
+            if (v0.layout.is_physical_contiguous() &&
+                v1.layout.is_physical_contiguous()) {
+                return assert_tensor_eq_with_iter<ctype>(
+                        expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(),
+                        v0.layout, maxerr, maxerr_avg, maxerr_avg_biased);
+            }
         }
         auto it0 = megdnn::tensor_iter_valonly<ctype>(v0).begin(),
@@ -100,56 +101,6 @@ namespace {
                 maxerr_avg_biased);
     }

-    template <typename ITYPE>
-    ::testing::AssertionResult assert_tensor_eq_with_lowbit4(
-            const char* expr0, const char* expr1, const TensorND& v0,
-            const TensorND& v1, float maxerr, float maxerr_avg) {
-        if (!v0.layout.eq_layout(v1.layout)) {
-            return ::testing::AssertionFailure()
-                   << "Layout mismatch for testing equality of lowbit4\n"
-                   << "Value of: " << expr1 << "\n"
-                   << "  Actual: " << v1.layout.TensorShape::to_string() << "\n"
-                   << "Expected: " << expr0 << "\n"
-                   << "Which is: " << v0.layout.TensorShape::to_string() << "\n";
-        }
-        auto v0_ptr = static_cast<ITYPE*>(v0.raw_ptr) - v0.layout.span().low_byte;
-        auto v1_ptr = static_cast<ITYPE*>(v1.raw_ptr) - v1.layout.span().low_byte;
-        double error_sum = 0;
-        for (size_t i = 0; i < v0.layout.span().dist_elem(); ++i) {
-            ITYPE iv0 = (v0_ptr[i / 2] << (i ^ 1) * 4);
-            iv0 = iv0 >> 4;
-            ITYPE iv1 = (v1_ptr[i / 2] << (i ^ 1) * 4);
-            iv1 = iv1 >> 4;
-            float err = std::abs(diff(iv0, iv1));
-            error_sum += err;
-            if (!good_float(iv0) || !good_float(iv1) || err >= maxerr) {
-                Index index(v0.layout, i);
-                return ::testing::AssertionFailure()
-                       << "Unequal value\n"
-                       << "Value of: " << expr1 << "\n"
-                       << "  Actual: " << (iv1 + 0) << "\n"
-                       << "Expected: " << expr0 << "\n"
-                       << "Which is: " << (iv0 + 0) << "\n"
-                       << "At index: " << index.to_string() << "/"
-                       << v0.layout.TensorShape::to_string() << "\n"
-                       << "   Dtype: " << v0.layout.dtype.name() << "\n"
-                       << "   error: " << err << "/" << maxerr;
-            }
-        }
-        float error_avg = error_sum / v0.layout.total_nr_elems();
-        if (error_avg > maxerr_avg) {
-            return ::testing::AssertionFailure()
-                   << "Average error too high\n"
-                   << "Value of: " << expr1 << "\n"
-                   << "Expected: " << expr0 << "\n"
-                   << "Average error: " << error_avg << "/" << maxerr_avg;
-        }
-        return ::testing::AssertionSuccess();
-    }
-
     template <class Impl>
     void memcpy_noncontig(
             void* dst, const void* src, const TensorLayout& layout,
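The shift-then-shift dance in the removed helper is a sign-extending nibble unpack: two int4 values share one byte, so the target nibble is first shifted into the top four bits and then arithmetic-shifted back down. A self-contained demo of the idea (illustrative code, not MegEngine's; assumes the usual arithmetic right shift on signed types):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Unpack the i-th signed 4-bit value from a low-nibble-first byte array.
int8_t unpack_int4(const int8_t* data, size_t i) {
    int8_t b = data[i / 2];
    // Even i: low nibble -> shift it up into bits 4..7 first.
    int8_t hi = (i % 2 == 0) ? int8_t(b << 4) : b;
    return int8_t(hi >> 4);  // arithmetic shift sign-extends the nibble
}

int main() {
    const int8_t packed = int8_t(0xF5);  // low nibble 5, high nibble -1
    printf("%d %d\n", unpack_int4(&packed, 0), unpack_int4(&packed, 1));
    // prints: 5 -1
}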
@@ -215,12 +166,7 @@ namespace {
         //! In order to avoid an unnecessary increase in binary size, we just
         //! use QuantizedS16 dtype in winograd_filter_preprocess now.
         cb(::megdnn::dtype::QuantizedS16)
-        case DTypeTrait<dtype::Quantized4Asymm>::enumv:
-            return assert_tensor_eq_with_lowbit4<uint8_t>(
-                    expr0, expr1, v0, v1, maxerr, maxerr_avg);
-        case DTypeTrait<dtype::QuantizedS4>::enumv:
-            return assert_tensor_eq_with_lowbit4<int8_t>(
-                    expr0, expr1, v0, v1, maxerr, maxerr_avg);
+        MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb)
 #undef cb
         default:
             megdnn_trap();
dnn/test/common/utils.h

@@ -228,6 +228,14 @@ static inline int diff(dt_qint8 x, dt_qint8 y) {
     return x.as_int8() - y.as_int8();
 }

+static inline int diff(dt_qint4 x, dt_qint4 y) {
+    return x.as_int8() - y.as_int8();
+}
+
+static inline int diff(dt_quint4 x, dt_quint4 y) {
+    return x.as_uint8() - y.as_uint8();
+}
+
 inline TensorShape cvt_src_or_dst_nchw2nhwc(const TensorShape& shape) {
     megdnn_assert(shape.ndim == 4);
     auto N = shape[0], C = shape[1], H = shape[2], W = shape[3];
@@ -356,6 +364,15 @@ static inline int operator+(dt_qint16 lhs, int rhs) {
     return lhs.as_int16();
 }

+static inline int operator+(dt_quint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_uint8();
+}
+
+static inline int operator+(dt_qint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_int8();
+}
 }  // namespace test

 static inline bool operator==(const TensorLayout& a, const TensorLayout& b) {
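These overloads mirror the existing dt_qint8/dt_qint16 ones; as the checker code above uses them, streaming (iv + 0) forces promotion to int so a low-bit value prints numerically. A self-contained analog (illustrative type, not MegEngine's definition):

#include <cstdint>
#include <iostream>

// Illustrative stand-in for dt_qint4.
struct Q4 {
    int8_t v;
    int8_t as_int8() const { return v; }
};

// Mirrors the overload above: only "+ 0" is expected.
inline int operator+(Q4 lhs, int) { return lhs.as_int8(); }

int main() {
    Q4 x{-3};
    std::cout << (x + 0) << '\n';  // prints -3 rather than a raw char
}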
dnn/test/cuda/relayout_format.cpp

@@ -11,13 +11,14 @@
  */
 #include "megdnn/dtype.h"
 #include "megdnn/oprs.h"
-#include "test/common/benchmarker.h"
+#include "test/cuda/benchmark.h"
 #include "test/common/checker.h"
 #include "test/common/rng.h"
 #include "test/cuda/fixture.h"

 using namespace megdnn;
 using namespace test;

+#define MEGDNN_WITH_BENCHMARK 1

 TEST_F(CUDA, RELAYOUT_FORMAT) {
     Checker<RelayoutFormat> checker(handle_cuda());
@@ -246,7 +247,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
     for (size_t n : {1, 3}) {
         for (size_t c : {64, 128}) {
             for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 7, 8, 16, 31}) {
                     checker.set_dtype(0, dtype::QuantizedS4{2.f})
                             .set_dtype(1, dtype::QuantizedS4{2.f})
                             .set_rng(0, &s4)
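The new widths {3, 7, 31} (and {3, 7, 17} in the next hunk) exercise the odd case named in the commit title. Why odd W is special for packed int4, as I read the change: a row holds W nibbles, so with W odd every other row starts mid-byte. Sketching W = 7 with low-nibble-first packing:

// nibble index i lives in byte i/2 (low nibble for even i)
// row 0: nibbles 0..6   -> bytes 0..2 plus the LOW  nibble of byte 3
// row 1: nibbles 7..13  -> the HIGH nibble of byte 3, then bytes 4..6
// A kernel that assumes rows begin on a byte (or vector) boundary breaks
// here, hence the dedicated odd-width handling in relayout_format.cu.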
@@ -286,7 +287,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
     for (size_t n : {1, 3}) {
         for (size_t c : {64, 128}) {
             for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
                     checker.set_dtype(0, dtype::QuantizedS4{2.f})
                             .set_dtype(1, dtype::QuantizedS4{2.f})
                             .set_rng(0, &s4)
@@ -366,6 +367,46 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
         run(shapes, param, default_param);
     }
 }

+TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
+    using Param = RelayoutFormat::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
+        benchmarker.set_param(param);
+        benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
+                .set_dtype(1, dtype::QuantizedS4{1.20210322f});
+        for (auto&& shape : shapes) {
+            double memaccess = double(shape.total_nr_elems()) * 1e-6;
+            auto time_ms = benchmarker.execs({shape, {}});
+            printf("execute %s, time %.4f ms, %.4f GB/s\n",
+                   shape.to_string().c_str(), time_ms, memaccess / time_ms);
+        }
+    };
+    {
+        TensorShapeArray shapes = {
+                {1, 64, 56, 56},  {16, 64, 56, 56}, {64, 64, 56, 56},
+                {1, 64, 56, 55},  {16, 64, 56, 55}, {64, 64, 56, 55},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
+        run(shapes, param);
+    }
+    {
+        TensorShapeArray shapes = {
+                {64, 1, 56, 56, 64},
+                {1, 32, 7, 7, 64},
+                {16, 32, 7, 7, 64},
+                {64, 32, 7, 7, 64},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
+        run(shapes, param);
+    }
+}
 #endif

 TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
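A note on the GB/s figure printed by the new benchmark: total_nr_elems() * 1e-6 divided by milliseconds equals GB/s only if each element accounts for roughly one byte of traffic, which plausibly holds here since each QuantizedS4 element is read once and written once (4 bits + 4 bits). A worked example under that assumption, with a hypothetical timing:

// For shape {16, 64, 56, 56}:
//   elems = 16 * 64 * 56 * 56 = 3,211,264
//   bytes ≈ elems               (int4 load + int4 store ≈ 1 byte/element)
//   GB/s  = bytes / (time_ms * 1e-3) / 1e9 = elems * 1e-6 / time_ms
// So if time_ms = 0.05, the printed bandwidth is 3.211264 / 0.05 ≈ 64.2 GB/s.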