Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
43bd2a35
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
43bd2a35
编写于
5月 21, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Improve transpose perf
上级
139a62b9
变更
3
显示空白变更内容
内联
并排
Showing
3 changed files
with
144 additions
and
24 deletions
+144
-24
mace/kernels/transpose.h
mace/kernels/transpose.h
+104
-22
mace/ops/transpose_benchmark.cc
mace/ops/transpose_benchmark.cc
+3
-0
mace/ops/transpose_test.cc
mace/ops/transpose_test.cc
+37
-2
未找到文件。
mace/kernels/transpose.h
浏览文件 @
43bd2a35
...
...
@@ -15,6 +15,10 @@
#ifndef MACE_KERNELS_TRANSPOSE_H_
#define MACE_KERNELS_TRANSPOSE_H_
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include <vector>
#include "mace/core/future.h"
...
...
@@ -25,6 +29,64 @@
namespace
mace
{
namespace
kernels
{
// Transposes a single 3-channel image from interleaved HWC layout to planar
// CHW layout.
//
//   input   height * width * 3 floats; element (h, w, c) is read from
//           input[h * width * 3 + w * 3 + c].
//   output  3 * height * width floats; element (c, h, w) is written to
//           output[c * height * width + h * width + w].
//   height  number of rows in the image.
//   width   number of columns in the image.
//
// The vectorised path is compiled only when MACE_ENABLE_NEON is defined, the
// same macro that guards the <arm_neon.h> include in this header. Without it
// the scalar loop below handles every column, so the header still builds on
// targets that have no NEON support.
static void TransposeNHWCToNCHWC3(const float *input,
                                  float *output,
                                  const index_t height,
                                  const index_t width) {
  const index_t image_size = height * width;

#pragma omp parallel for
  for (index_t h = 0; h < height; ++h) {
    index_t w = 0;

#if defined(MACE_ENABLE_NEON)
    index_t in_offset = h * width * 3;
    index_t out_offset = h * width;
    // Four pixels per iteration: vld3q_f32 loads 12 interleaved floats and
    // splits them into one 4-lane vector per channel, each of which is then
    // stored into its own output plane.
    for (; w + 3 < width; w += 4) {
      float32x4x3_t vi = vld3q_f32(input + in_offset);
      vst1q_f32(output + out_offset, vi.val[0]);
      vst1q_f32(output + out_offset + image_size, vi.val[1]);
      vst1q_f32(output + out_offset + image_size * 2, vi.val[2]);

      in_offset += 12;
      out_offset += 4;
    }
#endif  // MACE_ENABLE_NEON

    // Scalar path: the tail columns left over by the vector loop, or the
    // whole row when NEON is not enabled.
    for (; w < width; ++w) {
      for (index_t c = 0; c < 3; ++c) {
        output[h * width + image_size * c + w] =
            input[h * width * 3 + w * 3 + c];
      }
    }
  }
}
// Transposes a single 2-channel image from planar CHW layout to interleaved
// HWC layout.
//
//   input   2 * height * width floats; element (c, h, w) is read from
//           input[c * height * width + h * width + w].
//   output  height * width * 2 floats; element (h, w, c) is written to
//           output[h * width * 2 + w * 2 + c].
//   height  number of rows in the image.
//   width   number of columns in the image.
//
// The vectorised path is compiled only when MACE_ENABLE_NEON is defined, the
// same macro that guards the <arm_neon.h> include in this header. Without it
// the scalar loop below handles every column, so the header still builds on
// targets that have no NEON support.
static void TransposeNCHWToNHWCC2(const float *input,
                                  float *output,
                                  const index_t height,
                                  const index_t width) {
  const index_t image_size = height * width;

#pragma omp parallel for
  for (index_t h = 0; h < height; ++h) {
    index_t w = 0;

#if defined(MACE_ENABLE_NEON)
    index_t in_offset = h * width;
    index_t out_offset = h * width * 2;
    // Four pixels per iteration: one 4-lane load from each channel plane,
    // written back interleaved by vst2q_f32.
    for (; w + 3 < width; w += 4) {
      // Build the vector pair in a named variable instead of passing a braced
      // list to vst2q_f32: where the intrinsic is implemented as a macro, the
      // comma inside the braces would be parsed as an argument separator.
      float32x4x2_t vi;
      vi.val[0] = vld1q_f32(input + in_offset);
      vi.val[1] = vld1q_f32(input + in_offset + image_size);
      vst2q_f32(output + out_offset, vi);

      in_offset += 4;
      out_offset += 8;
    }
#endif  // MACE_ENABLE_NEON

    // Scalar path: the tail columns left over by the vector loop, or the
    // whole row when NEON is not enabled.
    for (; w < width; ++w) {
      for (index_t c = 0; c < 2; ++c) {
        output[h * width * 2 + w * 2 + c] =
            input[h * width + image_size * c + w];
      }
    }
  }
}
template
<
DeviceType
D
,
typename
T
>
struct
TransposeFunctor
{
explicit
TransposeFunctor
(
const
std
::
vector
<
int
>
&
dims
)
:
dims_
(
dims
)
{}
...
...
@@ -48,6 +110,25 @@ struct TransposeFunctor {
}
}
}
else
if
(
input
->
dim_size
()
==
4
)
{
std
::
vector
<
int
>
transpose_order_from_NHWC_to_NCHW
{
0
,
3
,
1
,
2
};
std
::
vector
<
int
>
transpose_order_from_NCHW_to_NHWC
{
0
,
2
,
3
,
1
};
index_t
batch_size
=
input
->
dim
(
1
)
*
input
->
dim
(
2
)
*
input
->
dim
(
3
);
if
(
dims_
==
transpose_order_from_NHWC_to_NCHW
&&
input
->
dim
(
3
)
==
3
)
{
for
(
index_t
b
=
0
;
b
<
input
->
dim
(
0
);
++
b
)
{
TransposeNHWCToNCHWC3
(
input_data
+
b
*
batch_size
,
output_data
+
b
*
batch_size
,
input
->
dim
(
1
),
input
->
dim
(
2
));
}
}
else
if
(
dims_
==
transpose_order_from_NCHW_to_NHWC
&&
input
->
dim
(
1
)
==
2
)
{
for
(
index_t
b
=
0
;
b
<
input
->
dim
(
0
);
++
b
)
{
TransposeNCHWToNHWCC2
(
input_data
+
b
*
batch_size
,
output_data
+
b
*
batch_size
,
input
->
dim
(
2
),
input
->
dim
(
3
));
}
}
else
{
std
::
vector
<
index_t
>
in_stride
{
input_shape
[
1
]
*
input_shape
[
2
]
*
input_shape
[
3
],
input_shape
[
2
]
*
input_shape
[
3
],
input_shape
[
3
],
1
};
...
...
@@ -74,6 +155,7 @@ struct TransposeFunctor {
}
}
}
}
}
else
{
MACE_NOT_IMPLEMENTED
;
}
...
...
mace/ops/transpose_benchmark.cc
浏览文件 @
43bd2a35
...
...
@@ -83,6 +83,9 @@ void TransposeBenchmark(int iters,
// Shorthand that forwards its eight arguments to BM_TRANSPOSE4D_MACRO with
// `float` and `CPU` appended. BM_TRANSPOSE4D_MACRO is defined outside this
// view; presumably the first four arguments are the input shape and D0..D3
// the transpose order -- TODO confirm against the macro definition.
#define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
// Shape arguments (1, 512, 512, 3) with order arguments (0, 3, 1, 2).
// NOTE(review): the macro names its shape parameters N, C, H, W, yet this call
// puts 3 in the last slot, so those names do not describe the layout here.
BM_TRANSPOSE4D
(
1
,
512
,
512
,
3
,
0
,
3
,
1
,
2
);
// Shape arguments (1, 2, 512, 512) with order arguments (0, 2, 3, 1).
BM_TRANSPOSE4D
(
1
,
2
,
512
,
512
,
0
,
2
,
3
,
1
);
// Shape arguments (1, 64, 64, 512) with order arguments (0, 3, 1, 2).
BM_TRANSPOSE4D
(
1
,
64
,
64
,
512
,
0
,
3
,
1
,
2
);
// Shape arguments (1, 512, 64, 64) with order arguments (0, 2, 3, 1).
BM_TRANSPOSE4D
(
1
,
512
,
64
,
64
,
0
,
2
,
3
,
1
);
// 2-D case with arguments (128, 128); BM_TRANSPOSE2D is defined outside this
// view.
BM_TRANSPOSE2D
(
128
,
128
);
...
...
mace/ops/transpose_test.cc
浏览文件 @
43bd2a35
...
...
@@ -37,16 +37,51 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
// Run on cpu
net
.
RunOp
();
net
.
FillNHWCInputToNCHWInput
<
DeviceType
::
CPU
,
float
>
(
"InputNCHW"
,
"Input"
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
DataFormat
::
NHWC
,
"InputNCHW"
,
DataFormat
::
NCHW
);
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"InputNCHW"
),
*
net
.
GetOutput
(
"Output"
));
}
// Runs the "Transpose" op with dims {0, 2, 3, 1} on a random CPU float tensor
// of shape `input_shape` and expects the result to be near what
// TransformDataFormat produces for the same tensor when converting from
// DataFormat::NCHW to DataFormat::NHWC.
void TransposeNHWCTest(const std::vector<index_t> &input_shape) {
  OpsTestNet net;

  // Random tensor that serves as the op input and as the reference source.
  net.AddRandomInput<CPU, float>("Input", input_shape);

  // Operator under test.
  OpDefBuilder("Transpose", "TransposeNHWCTest")
      .Input("Input")
      .Output("Output")
      .AddIntsArg("dims", {0, 2, 3, 1})
      .Finalize(net.NewOperatorDef());

  // Execute on CPU.
  net.RunOp();

  // Reference result, stored under "InputNHWC".
  net.TransformDataFormat<DeviceType::CPU, float>(
      "Input", DataFormat::NCHW, "InputNHWC", DataFormat::NHWC);

  auto &reference = *net.GetOutput("InputNHWC");
  auto &transposed = *net.GetOutput("Output");
  ExpectTensorNear<float>(reference, transposed);
}
}
// namespace
// Runs TransposeNCHWTest on four input shapes. Two of them end in a dimension
// of 3, the size for which the kernel header provides a dedicated routine;
// {2, 512, 512, 3} additionally uses a batch of 2.
TEST_F(TransposeOpTest, NHWC_to_NCHW) {
  TransposeNCHWTest({3, 64, 64, 128});
  TransposeNCHWTest({1, 64, 48, 128});
  TransposeNCHWTest({1, 512, 512, 3});
  TransposeNCHWTest({2, 512, 512, 3});
}
// Runs TransposeNHWCTest on each shape in the table, in order. Two shapes have
// 2 as their second dimension, the size for which the kernel header provides
// a dedicated routine; {1, 3, 512, 512} does not.
TEST_F(TransposeOpTest, NCHW_to_NHWC) {
  const std::vector<std::vector<index_t>> shapes = {
      {1, 2, 512, 512},
      {1, 3, 512, 512},
      {2, 2, 512, 512},
  };
  for (const auto &shape : shapes) {
    TransposeNHWCTest(shape);
  }
}
TEST_F
(
TransposeOpTest
,
Rank2
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录