Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
9304d7cc
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
9304d7cc
编写于
11月 08, 2017
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'relu-opencl' into 'master'
Finish relu opencl kernel. See merge request !96
上级
2e4c4928
73af2b22
变更
8
隐藏空白更改
内联
并排
Showing 8 changed files with 231 additions and 37 deletions
+231
-37
mace/kernels/neon/relu_neon.cc
mace/kernels/neon/relu_neon.cc
+5
-3
mace/kernels/opencl/cl/relu.cl
mace/kernels/opencl/cl/relu.cl
+32
-0
mace/kernels/opencl/relu_opencl.cc
mace/kernels/opencl/relu_opencl.cc
+59
-0
mace/kernels/relu.h
mace/kernels/relu.h
+11
-6
mace/ops/relu.cc
mace/ops/relu.cc
+1
-0
mace/ops/relu.h
mace/ops/relu.h
+1
-4
mace/ops/relu_benchmark.cc
mace/ops/relu_benchmark.cc
+5
-2
mace/ops/relu_test.cc
mace/ops/relu_test.cc
+117
-22
未找到文件。
mace/kernels/neon/relu_neon.cc
浏览文件 @
9304d7cc
...
...
@@ -9,9 +9,11 @@ namespace mace {
namespace
kernels
{
template
<
>
void
ReluFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
float
*
input
,
float
*
output
,
index_t
size
)
{
void
ReluFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
)
{
const
float
*
input
=
input_tensor
->
data
<
float
>
();
float
*
output
=
output_tensor
->
mutable_data
<
float
>
();
index_t
size
=
input_tensor
->
size
();
if
(
max_limit_
<
0
)
{
#pragma omp parallel for
for
(
int64_t
i
=
0
;
i
<
size
;
i
+=
kCostPerGroup
)
{
...
...
mace/kernels/opencl/cl/relu.cl
0 → 100644
浏览文件 @
9304d7cc
// ReLU kernel: output[i] = max(input[i], 0).
// Each work-item processes 4 consecutive floats; the host launches
// gws = ceil(size / 4) work-items (see relu_opencl.cc).
//
// Fix: get_global_id(0) is the 4-element *block* index, so the tail test
// must compare the element base (idx * 4), not idx itself, against size;
// the old `idx + 4 > size` check let interior blocks run the vector path
// past the end of the buffers, and the scalar tail loop misused the block
// index as an element index.
__kernel void relu(__global const float *input,
                   __private const int size,
                   __global float *output) {
  const int blk = get_global_id(0);
  const int base = blk << 2;  // first element owned by this work-item
  if (base + 4 > size) {
    // Tail block: process the remaining (< 4) elements one by one.
    for (int i = base; i < size; ++i) {
      output[i] = fmax(input[i], 0.0f);
    }
  } else {
    float4 data = vload4(blk, input);
    data = fmax(data, (float4)0.0f);
    vstore4(data, blk, output);
  }
}
// ReluX kernel: output[i] = clamp(input[i], 0, max_limit).
// Each work-item processes 4 consecutive floats; the host launches
// gws = ceil(size / 4) work-items (see relu_opencl.cc).
//
// Fix: same out-of-bounds defect as the `relu` kernel — the tail test must
// use the element base (idx * 4) rather than the block index, and the
// scalar tail loop must iterate over element indices.
__kernel void relux(__global const float *input,
                    __private const float max_limit,
                    __private const int size,
                    __global float *output) {
  const int blk = get_global_id(0);
  const int base = blk << 2;  // first element owned by this work-item
  if (base + 4 > size) {
    // Tail block: process the remaining (< 4) elements one by one.
    for (int i = base; i < size; ++i) {
      output[i] = clamp(input[i], 0.0f, max_limit);
    }
  } else {
    float4 data = vload4(blk, input);
    data = clamp(data, (float4)0.0f, (float4)max_limit);
    vstore4(data, blk, output);
  }
}
mace/kernels/opencl/relu_opencl.cc
0 → 100644
浏览文件 @
9304d7cc
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/relu.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"

namespace mace {
namespace kernels {

// OpenCL specialization of ReluFunctor.
// Dispatches to the "relu" kernel when max_limit_ < 0 (plain ReLU) or to
// "relux" otherwise (output clamped into [0, max_limit_]). Each work-item
// handles 4 elements, so the global work size is ceil(element_size / 4).
// The two branches of the original only differed by the extra max_limit
// argument, so they are merged here.
template <>
void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                        Tensor *output) {
  const index_t element_size = input->NumElements();
  if (element_size == 0) return;  // avoid enqueueing a zero-sized range
  const index_t blocks = (element_size + 3) / 4;
  const uint32_t gws = blocks;

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();

  const bool capped = (max_limit_ >= 0);
  auto relu_kernel = cl::Kernel(program, capped ? "relux" : "relu");

  uint32_t idx = 0;
  relu_kernel.setArg(idx++,
                     *(static_cast<const cl::Buffer *>(input->buffer())));
  if (capped) {
    relu_kernel.setArg(idx++, max_limit_);
  }
  relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
  relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));

  // NOTE(review): the previous code passed the kernel's maximum work-group
  // size as the local size, but gws is not guaranteed to be a multiple of
  // it, which yields CL_INVALID_WORK_GROUP_SIZE on OpenCL < 2.0. Let the
  // runtime choose a valid local size instead.
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      relu_kernel, cl::NullRange, cl::NDRange(gws), cl::NullRange);
  MACE_CHECK(error == CL_SUCCESS);
}

}  // namespace kernels
}  // namespace mace
mace/kernels/relu.h
浏览文件 @
9304d7cc
...
...
@@ -14,23 +14,28 @@ template <DeviceType D, typename T>
struct
ReluFunctor
{
T
max_limit_
;
void
operator
()(
const
T
*
input
,
T
*
output
,
index_t
size
)
{
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
)
{
const
T
*
input_ptr
=
input
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
index_t
size
=
input
->
size
();
if
(
max_limit_
<
0
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
max
(
input
[
i
],
static_cast
<
T
>
(
0
));
output
_ptr
[
i
]
=
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
));
}
}
else
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output
[
i
]
=
std
::
min
(
std
::
max
(
input
[
i
],
static_cast
<
T
>
(
0
)),
max_limit_
);
output
_ptr
[
i
]
=
std
::
min
(
std
::
max
(
input_ptr
[
i
],
static_cast
<
T
>
(
0
)),
max_limit_
);
}
}
}
};
template
<
>
void
ReluFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
float
*
input
,
float
*
output
,
index_t
size
);
void
ReluFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
);
template
<
>
void
ReluFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
);
}
// namespace kernels
}
// namespace mace
...
...
mace/ops/relu.cc
浏览文件 @
9304d7cc
...
...
@@ -12,4 +12,5 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
REGISTER_NEON_OPERATOR
(
Relu
,
ReluOp
<
DeviceType
::
NEON
,
float
>
);
#endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR
(
Relu
,
ReluOp
<
DeviceType
::
OPENCL
,
float
>
);
}
// namespace mace
mace/ops/relu.h
浏览文件 @
9304d7cc
...
...
@@ -22,11 +22,8 @@ class ReluOp : public Operator<D, T> {
const
Tensor
*
input_tensor
=
this
->
inputs_
[
0
];
Tensor
*
output_tensor
=
this
->
outputs_
[
0
];
output_tensor
->
ResizeLike
(
input_tensor
);
const
T
*
input
=
input_tensor
->
data
<
T
>
();
T
*
output
=
output_tensor
->
mutable_data
<
T
>
();
index_t
size
=
input_tensor
->
size
();
functor_
(
input
,
output
,
size
);
functor_
(
input
_tensor
,
output_tensor
);
return
true
;
}
...
...
mace/ops/relu_benchmark.cc
浏览文件 @
9304d7cc
...
...
@@ -19,17 +19,19 @@ static void ReluBenchmark(int iters, int size) {
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
"Input"
,
{
size
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
size
});
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
RunOp
(
D
);
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
net
.
RunOp
(
D
);
}
net
.
Sync
();
}
#define BM_RELU_MACRO(SIZE, TYPE, DEVICE) \
...
...
@@ -43,7 +45,8 @@ static void ReluBenchmark(int iters, int size) {
#define BM_RELU(SIZE, TYPE) \
BM_RELU_MACRO(SIZE, TYPE, CPU); \
BM_RELU_MACRO(SIZE, TYPE, NEON);
BM_RELU_MACRO(SIZE, TYPE, NEON);\
BM_RELU_MACRO(SIZE, TYPE, OPENCL);
BM_RELU
(
1000
,
float
);
BM_RELU
(
100000
,
float
);
...
...
mace/ops/relu_test.cc
浏览文件 @
9304d7cc
...
...
@@ -9,51 +9,146 @@ namespace mace {
class
ReluOpTest
:
public
OpsTestBase
{};
TEST_F
(
ReluOpTest
,
ReluOp
)
{
// Construct graph
auto
&
net
=
test_net
()
;
template
<
DeviceType
D
>
void
TestSimple
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Relu"
,
"ReluTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
5
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
2
,
2
,
2
,
2
},
{
-
7
,
7
,
-
6
,
6
,
-
5
,
5
,
-
4
,
4
,
-
3
,
3
,
-
2
,
2
,
-
1
,
1
,
0
,
0
});
// Run
net
.
RunOp
();
net
.
RunOp
(
D
);
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
auto
expected
=
CreateTensor
<
float
>
({
2
,
2
,
2
,
2
},
{
0
,
7
,
0
,
6
,
0
,
5
,
0
,
4
,
0
,
3
,
0
,
2
,
0
,
1
,
0
,
0
});
// Check
net
.
RunOp
(
DeviceType
::
NEON
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
TEST_F
(
ReluOpTest
,
CPUSimple
)
{
TestSimple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ReluOpTest
,
NEONSimple
)
{
TestSimple
<
DeviceType
::
NEON
>
();
}
TEST_F
(
ReluOpTest
,
OPENCLSimple
)
{
TestSimple
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
void
TestUnalignedSimple
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Relu"
,
"ReluTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.01
);
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
1
,
3
,
2
},
{
-
7
,
7
,
-
6
,
6
,
-
5
,
5
});
// Run
net
.
RunOp
(
D
);
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
3
,
2
},
{
0
,
7
,
0
,
6
,
0
,
5
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
TEST_F
(
ReluOpTest
,
CPUUnalignedSimple
)
{
TestUnalignedSimple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ReluOpTest
,
NEONUnalignedSimple
)
{
TestUnalignedSimple
<
DeviceType
::
NEON
>
();
}
TEST_F
(
ReluOpTest
,
ReluOpWithMax
)
{
// Construct graph
auto
&
net
=
test_net
();
OpDefBuilder
(
"Relu"
,
"ReluTestWithMax"
)
TEST_F
(
ReluOpTest
,
OPENCLUnalignedSimple
)
{
TestUnalignedSimple
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
void
TestSimpleReluX
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Relu"
,
"ReluTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
AddFloatArg
(
"max_limit"
,
0.5
)
.
AddFloatArg
(
"max_limit"
,
6
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
5
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
2
,
2
,
2
,
2
},
{
-
7
,
7
,
-
6
,
6
,
-
5
,
5
,
-
4
,
4
,
-
3
,
3
,
-
2
,
2
,
-
1
,
1
,
0
,
0
});
// Run
net
.
RunOp
();
net
.
RunOp
(
D
);
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
auto
expected
=
CreateTensor
<
float
>
({
2
,
2
,
2
,
2
},
{
0
,
6
,
0
,
6
,
0
,
5
,
0
,
4
,
0
,
3
,
0
,
2
,
0
,
1
,
0
,
0
});
// Check
net
.
RunOp
(
DeviceType
::
NEON
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
TEST_F
(
ReluOpTest
,
CPUSimpleReluX
)
{
TestSimpleReluX
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ReluOpTest
,
NEONSimpleReluX
)
{
TestSimpleReluX
<
DeviceType
::
NEON
>
();
}
TEST_F
(
ReluOpTest
,
OPENCLSimpleReluX
)
{
TestSimpleReluX
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
void
TestUnalignedSimpleReluX
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Relu"
,
"ReluTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
AddFloatArg
(
"max_limit"
,
6
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
1
,
1
,
7
},
{
-
7
,
7
,
-
6
,
6
,
-
5
,
5
,
-
4
});
// Run
net
.
RunOp
(
D
);
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
7
},
{
0
,
6
,
0
,
6
,
0
,
5
,
0
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
TEST_F
(
ReluOpTest
,
CPUUnalignedSimpleReluX
)
{
TestUnalignedSimpleReluX
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ReluOpTest
,
NEONUnalignedSimpleReluX
)
{
TestUnalignedSimpleReluX
<
DeviceType
::
NEON
>
();
}
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.01
);
TEST_F
(
ReluOpTest
,
OPENCLUnalignedSimpleReluX
)
{
TestUnalignedSimpleReluX
<
DeviceType
::
OPENCL
>
();
}
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录