慢慢CG / Mace (forked from Xiaomi / Mace)
Commit 327b02b3
Authored Oct 16, 2018 by 李寅

Improve performance of activation

Parent: 89b49ec0
Showing 4 changed files with 165 additions and 16 deletions (+165 -16)
mace/kernels/activation.h              +33  -0
mace/kernels/arm/activation_neon.cc    +71  -0
mace/kernels/arm/activation_neon.h     +31  -0
mace/kernels/conv_2d.h                 +30  -16
mace/kernels/activation.h
@@ -25,6 +25,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
 #include "mace/kernels/kernel.h"
+#include "mace/kernels/arm/activation_neon.h"
 
 namespace mace {
 namespace kernels {
@@ -98,6 +99,38 @@ void DoActivation(const T *input_ptr,
   }
 }
+
+template <>
+inline void DoActivation(const float *input_ptr,
+                         float *output_ptr,
+                         const index_t size,
+                         const ActivationType type,
+                         const float relux_max_limit) {
+  switch (type) {
+    case NOOP:
+      break;
+    case RELU:
+      ReluNeon(input_ptr, size, output_ptr);
+      break;
+    case RELUX:
+      ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
+      break;
+    case TANH:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output_ptr[i] = std::tanh(input_ptr[i]);
+      }
+      break;
+    case SIGMOID:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unknown activation type: " << type;
+  }
+}
 
 template <typename T>
 void PReLUActivation(const T *input_ptr,
                      const index_t outer_size,
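What this hunk changes is dispatch: DoActivation stays a function template, but calls with float data now resolve to this full specialization, which hands RELU and RELUX to the NEON kernels declared in activation_neon.h instead of running the generic loop. A minimal, self-contained illustration of that pattern follows; the names (Activate, etc.) are hypothetical and the bodies are trivial scalar loops, not the MACE kernels:

// Sketch only: shows how a full specialization of a function template
// overrides the generic path for one concrete type.  Not the MACE API.
#include <cstdio>

template <typename T>
void Activate(const T *in, T *out, int size) {
  // Generic path: portable scalar loop for any arithmetic type.
  for (int i = 0; i < size; ++i) out[i] = in[i] > T(0) ? in[i] : T(0);
}

template <>
void Activate<float>(const float *in, float *out, int size) {
  // Float path: this is where a hand-vectorized kernel (e.g. ReluNeon)
  // would be called in the committed code.
  for (int i = 0; i < size; ++i) out[i] = in[i] > 0.f ? in[i] : 0.f;
  std::printf("float specialization taken\n");
}

int main() {
  float in[3] = {-1.f, 0.f, 2.f}, out[3];
  Activate(in, out, 3);    // T deduced as float: runs the specialization
  double din[3] = {-1., 0., 2.}, dout[3];
  Activate(din, dout, 3);  // T = double: runs the generic template
  return 0;
}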
mace/kernels/arm/activation_neon.cc (new file, 0 → 100644)
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include <algorithm>
#include "mace/kernels/arm/activation_neon.h"
namespace mace {
namespace kernels {

void ReluNeon(const float *input, const index_t size, float *output) {
#if defined(MACE_ENABLE_NEON)
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (index_t i = 0; i <= size - 4; i += 4) {
    float32x4_t v = vld1q_f32(input + i);
    v = vmaxq_f32(v, vzero);
    vst1q_f32(output + i, v);
  }
  // remain
  for (index_t i = (size >> 2) << 2; i < size; ++i) {
    output[i] = std::max(input[i], 0.f);
  }
#else
#pragma omp parallel for
  for (index_t i = 0; i < size; ++i) {
    output[i] = std::max(input[i], 0.f);
  }
#endif
}

void ReluxNeon(const float *input,
               const float limit,
               const index_t size,
               float *output) {
#if defined(MACE_ENABLE_NEON)
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t vlimit = vdupq_n_f32(limit);
#pragma omp parallel for
  for (index_t i = 0; i <= size - 4; i += 4) {
    float32x4_t v = vld1q_f32(input + i);
    v = vmaxq_f32(v, vzero);
    v = vminq_f32(v, vlimit);
    vst1q_f32(output + i, v);
  }
  // remain
  for (index_t i = (size >> 2) << 2; i < size; ++i) {
    output[i] = std::min(std::max(input[i], 0.f), limit);
  }
#else
#pragma omp parallel for
  for (index_t i = 0; i < size; ++i) {
    output[i] = std::min(std::max(input[i], 0.f), limit);
  }
#endif
}

}  // namespace kernels
}  // namespace mace
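The vectorized loops above process whole groups of four floats and leave the tail to a scalar loop that starts at (size >> 2) << 2, i.e. size rounded down to a multiple of 4; for size = 11 that is index 8, so elements 8 to 10 take the scalar path. The standalone harness below is not part of the commit: relux_sketch is a hypothetical function mirroring that structure (OpenMP omitted), checked against a plain scalar reference on a length that is deliberately not a multiple of 4.

// Standalone sanity check for the NEON-body-plus-scalar-tail pattern.
// Assumes index_t is a signed 64-bit integer, as in mace/core/types.h.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif

using index_t = int64_t;

void relux_sketch(const float *input, float limit, index_t size, float *output) {
  index_t i = 0;
#if defined(__ARM_NEON)
  const float32x4_t vzero = vdupq_n_f32(0.f);
  const float32x4_t vlimit = vdupq_n_f32(limit);
  for (; i + 4 <= size; i += 4) {  // whole 4-lane blocks
    float32x4_t v = vld1q_f32(input + i);
    vst1q_f32(output + i, vminq_f32(vmaxq_f32(v, vzero), vlimit));
  }
  // Here i equals size rounded down to a multiple of 4, which is what
  // (size >> 2) << 2 computes in the committed kernels.
#endif
  for (; i < size; ++i) {  // scalar tail, or the whole buffer without NEON
    output[i] = std::min(std::max(input[i], 0.f), limit);
  }
}

int main() {
  const index_t n = 11;  // deliberately not a multiple of 4
  std::vector<float> in(n), out(n);
  for (index_t i = 0; i < n; ++i) in[i] = static_cast<float>(i) - 5.f;
  relux_sketch(in.data(), 3.f, n, out.data());
  for (index_t i = 0; i < n; ++i) {
    assert(out[i] == std::min(std::max(in[i], 0.f), 3.f));
  }
  return 0;
}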
mace/kernels/arm/activation_neon.h (new file, 0 → 100644)
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#define MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {

void ReluNeon(const float *input, const index_t size, float *output);

void ReluxNeon(const float *input,
               const float limit,
               const index_t size,
               float *output);

}  // namespace kernels
}  // namespace mace

#endif // MACE_KERNELS_ARM_ACTIVATION_NEON_H_
mace/kernels/conv_2d.h
@@ -544,6 +544,19 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          &sgemm_,
                          scratch);
       };
+    } else if (use_neon_1x1_s1) {
+      conv_func = [=](const float *pad_input, float *pad_output) {
+        Conv2dNeonK1x1S1(pad_input,
+                         filter_data,
+                         batch,
+                         extra_input_height,
+                         extra_input_width,
+                         input_channels,
+                         channels,
+                         pad_output,
+                         &sgemm_,
+                         scratch);
+      };
     } else if (use_neon_3x3_s1) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK3x3S1(pad_input,
@@ -560,19 +573,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          extra_output_shape,
                          pad_output);
       };
-    } else if (use_neon_1x1_s1) {
-      conv_func = [=](const float *pad_input, float *pad_output) {
-        Conv2dNeonK1x1S1(pad_input,
-                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         channels,
-                         pad_output,
-                         &sgemm_,
-                         scratch);
-      };
     } else if (use_neon_5x5_s1) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK5x5S1(pad_input,
@@ -699,13 +699,27 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     }
 
     if (bias_data != nullptr) {
+      const index_t image_size = height * width;
 #pragma omp parallel for collapse(2)
       for (index_t b = 0; b < batch; ++b) {
         for (index_t c = 0; c < channels; ++c) {
-          for (index_t i = 0; i < height * width; ++i) {
-            output_data[(b * channels + c) * height * width + i] +=
-                bias_data[c];
-          }
+          float *output_ptr = output_data + (b * channels + c) * image_size;
+          const float bias = bias_data[c];
+#if defined(MACE_ENABLE_NEON)
+          float32x4_t vbias = vdupq_n_f32(bias);
+          for (index_t i = 0; i <= image_size - 4; i += 4) {
+            float32x4_t v = vld1q_f32(output_ptr + i);
+            v = vaddq_f32(v, vbias);
+            vst1q_f32(output_ptr + i, v);
+          }
+          for (index_t i = (image_size >> 2) << 2; i < image_size; ++i) {
+            output_ptr[i] += bias;
+          }
+#else
+          for (index_t i = 0; i < image_size; ++i) {
+            output_ptr[i] += bias;
+          }
+#endif
         }
       }
     }
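The rewritten bias loop hoists the per-channel plane pointer and bias value out of the innermost loop and, when NEON is enabled, broadcasts the bias into a float32x4_t and adds it to four outputs at a time, with the usual scalar tail. A minimal self-contained sketch of that NCHW per-channel bias add follows; AddBiasNCHW and its signature are illustrative, not the MACE API, and the OpenMP pragma is omitted for brevity.

// Sketch of a per-channel bias add over an NCHW tensor (hypothetical names).
// Each (batch, channel) plane of height*width floats gets bias[c] added,
// vectorized four lanes at a time where NEON is available.
#include <cstdint>
#include <vector>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif

using index_t = int64_t;

void AddBiasNCHW(float *output, const float *bias,
                 index_t batch, index_t channels,
                 index_t height, index_t width) {
  const index_t image_size = height * width;
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      float *plane = output + (b * channels + c) * image_size;
      const float v = bias[c];
      index_t i = 0;
#if defined(__ARM_NEON)
      const float32x4_t vbias = vdupq_n_f32(v);
      for (; i + 4 <= image_size; i += 4) {
        vst1q_f32(plane + i, vaddq_f32(vld1q_f32(plane + i), vbias));
      }
#endif
      for (; i < image_size; ++i) {  // scalar tail, or whole plane without NEON
        plane[i] += v;
      }
    }
  }
}

int main() {
  std::vector<float> out(2 * 3 * 4 * 4, 1.f);  // N=2, C=3, H=W=4
  const float bias[3] = {0.5f, -1.f, 2.f};
  AddBiasNCHW(out.data(), bias, 2, 3, 4, 4);
  return 0;
}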