Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
4410ecd2
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
4410ecd2
编写于
3月 07, 2018
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Reformatting code and enable cpplint
上级
b26187f0
变更
128
显示空白变更内容
内联
并排
Showing
128 changed file
with
1634 addition
and
1831 deletion
+1634
-1831
.gitlab-ci.yml
.gitlab-ci.yml
+9
-0
mace/core/allocator.h
mace/core/allocator.h
+2
-2
mace/core/arg_helper.cc
mace/core/arg_helper.cc
+2
-2
mace/core/buffer.h
mace/core/buffer.h
+55
-88
mace/core/mace.cc
mace/core/mace.cc
+134
-291
mace/core/net.cc
mace/core/net.cc
+7
-7
mace/core/operator.h
mace/core/operator.h
+2
-2
mace/core/preallocated_pooled_allocator.h
mace/core/preallocated_pooled_allocator.h
+2
-2
mace/core/runtime/hexagon/hexagon_control_wrapper.cc
mace/core/runtime/hexagon/hexagon_control_wrapper.cc
+69
-83
mace/core/runtime/hexagon/hexagon_control_wrapper.h
mace/core/runtime/hexagon/hexagon_control_wrapper.h
+8
-10
mace/core/runtime/hexagon/hexagon_controller_dummy.cc
mace/core/runtime/hexagon/hexagon_controller_dummy.cc
+140
-26
mace/core/runtime/hexagon/hexagon_nn.h
mace/core/runtime/hexagon/hexagon_nn.h
+135
-67
mace/core/runtime/hexagon/hexagon_nn_ops.h
mace/core/runtime/hexagon/hexagon_nn_ops.h
+5
-5
mace/core/runtime/hexagon/ops.h
mace/core/runtime/hexagon/ops.h
+0
-1
mace/core/runtime/hexagon/quantize.cc
mace/core/runtime/hexagon/quantize.cc
+5
-5
mace/core/runtime/hexagon/quantize.h
mace/core/runtime/hexagon/quantize.h
+11
-7
mace/core/runtime/opencl/opencl_allocator.cc
mace/core/runtime/opencl/opencl_allocator.cc
+11
-16
mace/core/runtime/opencl/opencl_development.cc
mace/core/runtime/opencl/opencl_development.cc
+3
-2
mace/core/runtime/opencl/opencl_production.cc
mace/core/runtime/opencl/opencl_production.cc
+2
-1
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+16
-21
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+3
-1
mace/core/runtime/opencl/opencl_wrapper.h
mace/core/runtime/opencl/opencl_wrapper.h
+4
-4
mace/core/tensor.h
mace/core/tensor.h
+38
-57
mace/core/testing/test_benchmark.cc
mace/core/testing/test_benchmark.cc
+1
-3
mace/core/testing/test_benchmark.h
mace/core/testing/test_benchmark.h
+1
-1
mace/core/types.cc
mace/core/types.cc
+7
-13
mace/core/workspace.cc
mace/core/workspace.cc
+36
-39
mace/core/workspace.h
mace/core/workspace.h
+2
-2
mace/kernels/activation.h
mace/kernels/activation.h
+12
-10
mace/kernels/addn.h
mace/kernels/addn.h
+1
-1
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+4
-6
mace/kernels/bias_add.h
mace/kernels/bias_add.h
+1
-3
mace/kernels/buffer_to_image.h
mace/kernels/buffer_to_image.h
+6
-8
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+4
-2
mace/kernels/concat.h
mace/kernels/concat.h
+10
-10
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+6
-6
mace/kernels/conv_pool_2d_util.cc
mace/kernels/conv_pool_2d_util.cc
+21
-10
mace/kernels/conv_pool_2d_util.h
mace/kernels/conv_pool_2d_util.h
+1
-1
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+5
-4
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+11
-14
mace/kernels/fully_connected.h
mace/kernels/fully_connected.h
+8
-10
mace/kernels/global_avg_pooling.h
mace/kernels/global_avg_pooling.h
+4
-2
mace/kernels/matmul.h
mace/kernels/matmul.h
+1
-4
mace/kernels/neon/batch_norm_neon.cc
mace/kernels/neon/batch_norm_neon.cc
+2
-1
mace/kernels/neon/conv_2d_neon.cc
mace/kernels/neon/conv_2d_neon.cc
+4
-5
mace/kernels/neon/conv_2d_neon_3x3.cc
mace/kernels/neon/conv_2d_neon_3x3.cc
+4
-8
mace/kernels/neon/depthwise_conv_neon.cc
mace/kernels/neon/depthwise_conv_neon.cc
+4
-5
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+3
-4
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+8
-16
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+3
-6
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+8
-13
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+28
-26
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+2
-2
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+24
-29
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+19
-18
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+5
-11
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+5
-10
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+5
-10
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+11
-12
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+6
-14
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+12
-23
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+40
-48
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+6
-7
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+9
-18
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+9
-18
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+9
-11
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+6
-11
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+16
-15
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+53
-48
mace/kernels/pooling.h
mace/kernels/pooling.h
+25
-26
mace/kernels/reshape.h
mace/kernels/reshape.h
+1
-2
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+11
-10
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+11
-11
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+22
-22
mace/ops/activation.h
mace/ops/activation.h
+2
-1
mace/ops/activation_test.cc
mace/ops/activation_test.cc
+3
-4
mace/ops/addn.h
mace/ops/addn.h
+4
-6
mace/ops/addn_benchmark.cc
mace/ops/addn_benchmark.cc
+1
-2
mace/ops/batch_norm_benchmark.cc
mace/ops/batch_norm_benchmark.cc
+1
-1
mace/ops/batch_to_space.h
mace/ops/batch_to_space.h
+14
-13
mace/ops/batch_to_space_benchmark.cc
mace/ops/batch_to_space_benchmark.cc
+1
-1
mace/ops/bias_add_benchmark.cc
mace/ops/bias_add_benchmark.cc
+1
-1
mace/ops/buffer_to_image.h
mace/ops/buffer_to_image.h
+5
-4
mace/ops/buffer_to_image_test.cc
mace/ops/buffer_to_image_test.cc
+45
-26
mace/ops/channel_shuffle.h
mace/ops/channel_shuffle.h
+2
-2
mace/ops/channel_shuffle_benchmark.cc
mace/ops/channel_shuffle_benchmark.cc
+1
-1
mace/ops/concat.h
mace/ops/concat.h
+3
-2
mace/ops/concat_benchmark.cc
mace/ops/concat_benchmark.cc
+8
-11
mace/ops/concat_test.cc
mace/ops/concat_test.cc
+4
-4
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+47
-57
mace/ops/eltwise.h
mace/ops/eltwise.h
+6
-4
mace/ops/eltwise_benchmark.cc
mace/ops/eltwise_benchmark.cc
+1
-1
mace/ops/eltwise_test.cc
mace/ops/eltwise_test.cc
+41
-55
mace/ops/folded_batch_norm.cc
mace/ops/folded_batch_norm.cc
+12
-15
mace/ops/folded_batch_norm_test.cc
mace/ops/folded_batch_norm_test.cc
+1
-1
mace/ops/fully_connected.h
mace/ops/fully_connected.h
+4
-5
mace/ops/fully_connected_benchmark.cc
mace/ops/fully_connected_benchmark.cc
+16
-12
mace/ops/fully_connected_test.cc
mace/ops/fully_connected_test.cc
+32
-66
mace/ops/fused_conv_2d_test.cc
mace/ops/fused_conv_2d_test.cc
+24
-16
mace/ops/global_avg_pooling.h
mace/ops/global_avg_pooling.h
+1
-1
mace/ops/global_avg_pooling_benchmark.cc
mace/ops/global_avg_pooling_benchmark.cc
+2
-2
mace/ops/image_to_buffer.h
mace/ops/image_to_buffer.h
+5
-4
mace/ops/matmul.h
mace/ops/matmul.h
+2
-2
mace/ops/matmul_test.cc
mace/ops/matmul_test.cc
+27
-35
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+4
-5
mace/ops/pooling.h
mace/ops/pooling.h
+6
-2
mace/ops/pooling_benchmark.cc
mace/ops/pooling_benchmark.cc
+2
-2
mace/ops/pooling_test.cc
mace/ops/pooling_test.cc
+4
-2
mace/ops/reshape.h
mace/ops/reshape.h
+5
-3
mace/ops/reshape_test.cc
mace/ops/reshape_test.cc
+0
-1
mace/ops/softmax.cc
mace/ops/softmax.cc
+3
-3
mace/ops/softmax.h
mace/ops/softmax.h
+2
-3
mace/ops/softmax_test.cc
mace/ops/softmax_test.cc
+7
-9
mace/ops/space_to_batch.h
mace/ops/space_to_batch.h
+14
-11
mace/ops/space_to_batch_benchmark.cc
mace/ops/space_to_batch_benchmark.cc
+1
-1
mace/ops/winograd_convolution_test.cc
mace/ops/winograd_convolution_test.cc
+22
-14
mace/ops/winograd_inverse_transform.h
mace/ops/winograd_inverse_transform.h
+2
-2
mace/ops/winograd_transform.h
mace/ops/winograd_transform.h
+3
-3
mace/ops/winograd_transform_benchmark.cc
mace/ops/winograd_transform_benchmark.cc
+20
-22
mace/public/mace.h
mace/public/mace.h
+22
-24
mace/utils/command_line_flags.h
mace/utils/command_line_flags.h
+1
-1
mace/utils/env_time.h
mace/utils/env_time.h
+0
-1
mace/utils/logging.h
mace/utils/logging.h
+1
-1
mace/utils/string_util.h
mace/utils/string_util.h
+1
-1
mace/utils/timer.h
mace/utils/timer.h
+4
-12
mace/utils/tuner_test.cc
mace/utils/tuner_test.cc
+8
-21
mace/utils/utils.h
mace/utils/utils.h
+2
-2
mace/utils/utils_test.cc
mace/utils/utils_test.cc
+8
-21
未找到文件。
.gitlab-ci.yml
浏览文件 @
4410ecd2
stages
:
-
ops_test
-
ops_benchmark
-
cpplint
cpplint
:
stage
:
cpplint
only
:
-
master
script
:
-
curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
-
python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc | grep -vE "half.h")
ops_test
:
stage
:
ops_test
...
...
mace/core/allocator.h
浏览文件 @
4410ecd2
...
...
@@ -9,8 +9,8 @@
#include <malloc.h>
#include "mace/core/registry.h"
#include "mace/public/mace.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
namespace
mace
{
...
...
@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
free
(
data
);
};
void
*
Map
(
void
*
buffer
,
size_t
offset
,
size_t
nbytes
)
const
override
{
return
(
char
*
)
buffer
+
offset
;
return
(
char
*
)
buffer
+
offset
;
}
void
*
MapImage
(
void
*
buffer
,
const
std
::
vector
<
size_t
>
&
image_shape
,
...
...
mace/core/arg_helper.cc
浏览文件 @
4410ecd2
mace/core/buffer.h
浏览文件 @
4410ecd2
...
...
@@ -5,9 +5,9 @@
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector>
#include "mace/core/allocator.h"
#include "mace/core/types.h"
namespace
mace
{
...
...
@@ -39,23 +39,19 @@ class BufferBase {
virtual
bool
OnHost
()
const
=
0
;
virtual
index_t
offset
()
const
{
return
0
;
};
virtual
index_t
offset
()
const
{
return
0
;
};
template
<
typename
T
>
template
<
typename
T
>
const
T
*
data
()
const
{
return
reinterpret_cast
<
const
T
*>
(
raw_data
());
}
template
<
typename
T
>
template
<
typename
T
>
T
*
mutable_data
()
{
return
reinterpret_cast
<
T
*>
(
raw_mutable_data
());
}
index_t
size
()
const
{
return
size_
;
}
index_t
size
()
const
{
return
size_
;
}
protected:
index_t
size_
;
...
...
@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_CHECK_NOTNULL
(
mapped_buf_
);
MACE_CHECK
(
length
<=
size_
,
"out of buffer"
);
memcpy
(
mapped_buf_
,
(
char
*
)
src
+
offset
,
length
);
memcpy
(
mapped_buf_
,
(
char
*
)
src
+
offset
,
length
);
}
bool
OnHost
()
const
{
return
allocator_
->
OnHost
();
}
bool
OnHost
()
const
{
return
allocator_
->
OnHost
();
}
private:
Allocator
*
allocator_
;
...
...
@@ -180,9 +174,10 @@ class Image : public BufferBase {
mapped_buf_
(
nullptr
)
{}
Image
(
std
::
vector
<
size_t
>
shape
,
DataType
data_type
)
:
BufferBase
(
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
())
*
GetEnumTypeSize
(
data_type
)),
:
BufferBase
(
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
())
*
GetEnumTypeSize
(
data_type
)),
allocator_
(
GetDeviceAllocator
(
OPENCL
)),
mapped_buf_
(
nullptr
)
{
shape_
=
shape
;
...
...
@@ -214,9 +209,7 @@ class Image : public BufferBase {
return
mapped_buf_
;
}
std
::
vector
<
size_t
>
image_shape
()
const
{
return
shape_
;
}
std
::
vector
<
size_t
>
image_shape
()
const
{
return
shape_
;
}
void
*
Map
(
index_t
offset
,
index_t
length
,
std
::
vector
<
size_t
>
*
pitch
)
const
{
MACE_NOT_IMPLEMENTED
;
...
...
@@ -241,17 +234,11 @@ class Image : public BufferBase {
mapped_buf_
=
nullptr
;
};
void
Resize
(
index_t
size
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Resize
(
index_t
size
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_NOT_IMPLEMENTED
;
}
bool
OnHost
()
const
{
return
allocator_
->
OnHost
();
}
bool
OnHost
()
const
{
return
allocator_
->
OnHost
();
}
private:
Allocator
*
allocator_
;
...
...
@@ -266,10 +253,7 @@ class Image : public BufferBase {
class
BufferSlice
:
public
BufferBase
{
public:
BufferSlice
()
:
buffer_
(
nullptr
),
mapped_buf_
(
nullptr
),
offset_
(
0
),
length_
(
0
)
{}
:
buffer_
(
nullptr
),
mapped_buf_
(
nullptr
),
offset_
(
0
),
length_
(
0
)
{}
BufferSlice
(
BufferBase
*
buffer
,
index_t
offset
,
index_t
length
)
:
BufferBase
(
buffer
->
size
()),
buffer_
(
buffer
),
...
...
@@ -277,17 +261,11 @@ class BufferSlice : public BufferBase {
offset_
(
offset
),
length_
(
length
)
{
MACE_CHECK
(
offset
>=
0
,
"buffer slice offset should >= 0"
);
MACE_CHECK
(
offset
+
length
<=
size_
,
"buffer slice offset + length ("
,
offset
,
" + "
,
length
,
") should <= "
,
size_
);
}
BufferSlice
(
const
BufferSlice
&
other
)
:
BufferSlice
(
other
.
buffer_
,
other
.
offset_
,
other
.
length_
)
{}
MACE_CHECK
(
offset
+
length
<=
size_
,
"buffer slice offset + length ("
,
offset
,
" + "
,
length
,
") should <= "
,
size_
);
}
BufferSlice
(
const
BufferSlice
&
other
)
:
BufferSlice
(
other
.
buffer_
,
other
.
offset_
,
other
.
length_
)
{}
~
BufferSlice
()
{
if
(
buffer_
!=
nullptr
&&
mapped_buf_
!=
nullptr
)
{
...
...
@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
const
void
*
raw_data
()
const
{
if
(
OnHost
())
{
MACE_CHECK_NOTNULL
(
buffer_
);
return
(
char
*
)
buffer_
->
raw_data
()
+
offset_
;
return
(
char
*
)
buffer_
->
raw_data
()
+
offset_
;
}
else
{
MACE_CHECK_NOTNULL
(
mapped_buf_
);
return
mapped_buf_
;
...
...
@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
return
nullptr
;
}
void
UnMap
(
void
*
mapped_ptr
)
const
{
MACE_NOT_IMPLEMENTED
;
}
void
UnMap
(
void
*
mapped_ptr
)
const
{
MACE_NOT_IMPLEMENTED
;
}
void
Map
(
std
::
vector
<
size_t
>
*
pitch
)
{
MACE_CHECK_NOTNULL
(
buffer_
);
...
...
@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
mapped_buf_
=
nullptr
;
};
void
Resize
(
index_t
size
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Resize
(
index_t
size
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_NOT_IMPLEMENTED
;
}
void
Copy
(
void
*
src
,
index_t
offset
,
index_t
length
)
{
MACE_NOT_IMPLEMENTED
;
}
index_t
offset
()
const
{
return
offset_
;
}
index_t
offset
()
const
{
return
offset_
;
}
bool
OnHost
()
const
{
return
buffer_
->
OnHost
();
}
bool
OnHost
()
const
{
return
buffer_
->
OnHost
();
}
private:
BufferBase
*
buffer_
;
...
...
@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
index_t
offset_
;
index_t
length_
;
};
}
#endif // MACE_CORE_BUFFER_H_
mace/core/mace.cc
浏览文件 @
4410ecd2
...
...
@@ -3,9 +3,9 @@
//
#include "mace/public/mace.h"
#include "mace/core/types.h"
#include "mace/core/net.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/types.h"
namespace
mace
{
...
...
@@ -13,11 +13,11 @@ ConstTensor::ConstTensor(const std::string &name,
const
unsigned
char
*
data
,
const
std
::
vector
<
int64_t
>
&
dims
,
const
DataType
data_type
,
uint32_t
node_id
)
:
name_
(
name
),
uint32_t
node_id
)
:
name_
(
name
),
data_
(
data
),
data_size_
(
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
1
,
std
::
multiplies
<
int64_t
>
())),
data_size_
(
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
1
,
std
::
multiplies
<
int64_t
>
())),
dims_
(
dims
.
begin
(),
dims
.
end
()),
data_type_
(
data_type
),
node_id_
(
node_id
)
{}
...
...
@@ -26,33 +26,21 @@ ConstTensor::ConstTensor(const std::string &name,
const
unsigned
char
*
data
,
const
std
::
vector
<
int64_t
>
&
dims
,
const
int
data_type
,
uint32_t
node_id
)
:
name_
(
name
),
uint32_t
node_id
)
:
name_
(
name
),
data_
(
data
),
data_size_
(
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
1
,
std
::
multiplies
<
int64_t
>
())),
data_size_
(
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
1
,
std
::
multiplies
<
int64_t
>
())),
dims_
(
dims
.
begin
(),
dims
.
end
()),
data_type_
(
static_cast
<
DataType
>
(
data_type
)),
node_id_
(
node_id
)
{}
const
std
::
string
&
ConstTensor
::
name
()
const
{
return
name_
;
}
const
unsigned
char
*
ConstTensor
::
data
()
const
{
return
data_
;
}
int64_t
ConstTensor
::
data_size
()
const
{
return
data_size_
;
}
const
std
::
vector
<
int64_t
>
&
ConstTensor
::
dims
()
const
{
return
dims_
;
}
DataType
ConstTensor
::
data_type
()
const
{
return
data_type_
;
}
uint32_t
ConstTensor
::
node_id
()
const
{
return
node_id_
;
}
const
std
::
string
&
ConstTensor
::
name
()
const
{
return
name_
;
}
const
unsigned
char
*
ConstTensor
::
data
()
const
{
return
data_
;
}
int64_t
ConstTensor
::
data_size
()
const
{
return
data_size_
;
}
const
std
::
vector
<
int64_t
>
&
ConstTensor
::
dims
()
const
{
return
dims_
;
}
DataType
ConstTensor
::
data_type
()
const
{
return
data_type_
;
}
uint32_t
ConstTensor
::
node_id
()
const
{
return
node_id_
;
}
Argument
::
Argument
()
:
has_bits_
(
0
)
{}
...
...
@@ -73,74 +61,42 @@ void Argument::CopyFrom(const Argument &from) {
this
->
has_bits_
=
from
.
has_bits_
;
}
const
std
::
string
&
Argument
::
name
()
const
{
return
name_
;
}
void
Argument
::
set_name
(
const
std
::
string
&
value
)
{
name_
=
value
;
}
bool
Argument
::
has_f
()
const
{
return
(
has_bits_
&
0x00000001u
)
!=
0
;
}
void
Argument
::
set_has_f
()
{
has_bits_
|=
0x00000001u
;
}
float
Argument
::
f
()
const
{
return
f_
;
}
const
std
::
string
&
Argument
::
name
()
const
{
return
name_
;
}
void
Argument
::
set_name
(
const
std
::
string
&
value
)
{
name_
=
value
;
}
bool
Argument
::
has_f
()
const
{
return
(
has_bits_
&
0x00000001u
)
!=
0
;
}
void
Argument
::
set_has_f
()
{
has_bits_
|=
0x00000001u
;
}
float
Argument
::
f
()
const
{
return
f_
;
}
void
Argument
::
set_f
(
float
value
)
{
set_has_f
();
f_
=
value
;
}
bool
Argument
::
has_i
()
const
{
return
(
has_bits_
&
0x00000002u
)
!=
0
;
}
void
Argument
::
set_has_i
()
{
has_bits_
|=
0x00000002u
;
}
int64_t
Argument
::
i
()
const
{
return
i_
;
}
bool
Argument
::
has_i
()
const
{
return
(
has_bits_
&
0x00000002u
)
!=
0
;
}
void
Argument
::
set_has_i
()
{
has_bits_
|=
0x00000002u
;
}
int64_t
Argument
::
i
()
const
{
return
i_
;
}
void
Argument
::
set_i
(
int64_t
value
)
{
set_has_i
();
i_
=
value
;
}
bool
Argument
::
has_s
()
const
{
return
(
has_bits_
&
0x00000004u
)
!=
0
;
}
void
Argument
::
set_has_s
()
{
has_bits_
|=
0x00000004u
;
}
std
::
string
Argument
::
s
()
const
{
return
s_
;
}
bool
Argument
::
has_s
()
const
{
return
(
has_bits_
&
0x00000004u
)
!=
0
;
}
void
Argument
::
set_has_s
()
{
has_bits_
|=
0x00000004u
;
}
std
::
string
Argument
::
s
()
const
{
return
s_
;
}
void
Argument
::
set_s
(
const
std
::
string
&
value
)
{
set_has_s
();
s_
=
value
;
}
const
std
::
vector
<
float
>
&
Argument
::
floats
()
const
{
return
floats_
;
}
void
Argument
::
add_floats
(
float
value
)
{
floats_
.
push_back
(
value
);
}
const
std
::
vector
<
float
>
&
Argument
::
floats
()
const
{
return
floats_
;
}
void
Argument
::
add_floats
(
float
value
)
{
floats_
.
push_back
(
value
);
}
void
Argument
::
set_floats
(
const
std
::
vector
<
float
>
&
value
)
{
floats_
.
resize
(
value
.
size
());
std
::
copy
(
value
.
begin
(),
value
.
end
(),
floats_
.
begin
());
}
const
std
::
vector
<
int64_t
>
&
Argument
::
ints
()
const
{
return
ints_
;
}
void
Argument
::
add_ints
(
int64_t
value
)
{
ints_
.
push_back
(
value
);
}
const
std
::
vector
<
int64_t
>
&
Argument
::
ints
()
const
{
return
ints_
;
}
void
Argument
::
add_ints
(
int64_t
value
)
{
ints_
.
push_back
(
value
);
}
void
Argument
::
set_ints
(
const
std
::
vector
<
int64_t
>
&
value
)
{
ints_
.
resize
(
value
.
size
());
std
::
copy
(
value
.
begin
(),
value
.
end
(),
ints_
.
begin
());
}
const
std
::
vector
<
std
::
string
>
&
Argument
::
strings
()
const
{
return
strings_
;
}
const
std
::
vector
<
std
::
string
>
&
Argument
::
strings
()
const
{
return
strings_
;
}
void
Argument
::
add_strings
(
const
::
std
::
string
&
value
)
{
strings_
.
push_back
(
value
);
}
...
...
@@ -156,31 +112,21 @@ void NodeInput::CopyFrom(const NodeInput &from) {
node_id_
=
from
.
node_id
();
output_port_
=
from
.
output_port
();
}
int
NodeInput
::
node_id
()
const
{
return
node_id_
;
}
void
NodeInput
::
set_node_id
(
int
node_id
)
{
node_id_
=
node_id
;
}
int
NodeInput
::
output_port
()
const
{
return
output_port_
;
}
void
NodeInput
::
set_output_port
(
int
output_port
)
{
output_port_
=
output_port
;
}
int
NodeInput
::
node_id
()
const
{
return
node_id_
;
}
void
NodeInput
::
set_node_id
(
int
node_id
)
{
node_id_
=
node_id
;
}
int
NodeInput
::
output_port
()
const
{
return
output_port_
;
}
void
NodeInput
::
set_output_port
(
int
output_port
)
{
output_port_
=
output_port
;
}
// OutputShape
OutputShape
::
OutputShape
()
{}
OutputShape
::
OutputShape
(
const
std
::
vector
<
int64_t
>
&
dims
)
:
dims_
(
dims
.
begin
(),
dims
.
end
())
{}
OutputShape
::
OutputShape
(
const
std
::
vector
<
int64_t
>
&
dims
)
:
dims_
(
dims
.
begin
(),
dims
.
end
())
{}
void
OutputShape
::
CopyFrom
(
const
OutputShape
&
from
)
{
auto
from_dims
=
from
.
dims
();
dims_
.
resize
(
from_dims
.
size
());
std
::
copy
(
from_dims
.
begin
(),
from_dims
.
end
(),
dims_
.
begin
());
}
const
std
::
vector
<
int64_t
>
&
OutputShape
::
dims
()
const
{
return
dims_
;
}
const
std
::
vector
<
int64_t
>
&
OutputShape
::
dims
()
const
{
return
dims_
;
}
// Operator Def
void
OperatorDef
::
CopyFrom
(
const
OperatorDef
&
from
)
{
...
...
@@ -220,68 +166,38 @@ void OperatorDef::CopyFrom(const OperatorDef &from) {
}
auto
from_out_max_byte_size
=
from
.
out_max_byte_size
();
out_max_byte_size_
.
resize
(
from_out_max_byte_size
.
size
());
std
::
copy
(
from_out_max_byte_size
.
begin
(),
from_out_max_byte_size
.
end
(),
std
::
copy
(
from_out_max_byte_size
.
begin
(),
from_out_max_byte_size
.
end
(),
out_max_byte_size_
.
begin
());
has_bits_
=
from
.
has_bits_
;
}
const
std
::
string
&
OperatorDef
::
name
()
const
{
return
name_
;
}
const
std
::
string
&
OperatorDef
::
name
()
const
{
return
name_
;
}
void
OperatorDef
::
set_name
(
const
std
::
string
&
name_
)
{
set_has_name
();
OperatorDef
::
name_
=
name_
;
}
bool
OperatorDef
::
has_name
()
const
{
return
(
has_bits_
&
0x00000001u
)
!=
0
;
}
void
OperatorDef
::
set_has_name
()
{
has_bits_
|=
0x00000001u
;
}
const
std
::
string
&
OperatorDef
::
type
()
const
{
return
type_
;
}
bool
OperatorDef
::
has_name
()
const
{
return
(
has_bits_
&
0x00000001u
)
!=
0
;
}
void
OperatorDef
::
set_has_name
()
{
has_bits_
|=
0x00000001u
;
}
const
std
::
string
&
OperatorDef
::
type
()
const
{
return
type_
;
}
void
OperatorDef
::
set_type
(
const
std
::
string
&
type_
)
{
set_has_type
();
OperatorDef
::
type_
=
type_
;
}
bool
OperatorDef
::
has_type
()
const
{
return
(
has_bits_
&
0x00000002u
)
!=
0
;
}
void
OperatorDef
::
set_has_type
()
{
has_bits_
|=
0x00000002u
;
}
int
OperatorDef
::
mem_id
()
const
{
return
mem_id_
;
}
bool
OperatorDef
::
has_type
()
const
{
return
(
has_bits_
&
0x00000002u
)
!=
0
;
}
void
OperatorDef
::
set_has_type
()
{
has_bits_
|=
0x00000002u
;
}
int
OperatorDef
::
mem_id
()
const
{
return
mem_id_
;
}
void
OperatorDef
::
set_mem_id
(
const
int
mem_id
)
{
set_has_mem_id
();
mem_id_
=
mem_id
;
}
bool
OperatorDef
::
has_mem_id
()
const
{
return
(
has_bits_
&
0x00000004u
)
!=
0
;
}
void
OperatorDef
::
set_has_mem_id
()
{
has_bits_
|=
0x00000004u
;
}
uint32_t
OperatorDef
::
node_id
()
const
{
return
node_id_
;
}
void
OperatorDef
::
set_node_id
(
uint32_t
node_id
)
{
node_id_
=
node_id
;
}
uint32_t
OperatorDef
::
op_id
()
const
{
return
op_id_
;
}
uint32_t
OperatorDef
::
padding
()
const
{
return
padding_
;
}
void
OperatorDef
::
set_padding
(
uint32_t
padding
)
{
padding_
=
padding
;
}
bool
OperatorDef
::
has_mem_id
()
const
{
return
(
has_bits_
&
0x00000004u
)
!=
0
;
}
void
OperatorDef
::
set_has_mem_id
()
{
has_bits_
|=
0x00000004u
;
}
uint32_t
OperatorDef
::
node_id
()
const
{
return
node_id_
;
}
void
OperatorDef
::
set_node_id
(
uint32_t
node_id
)
{
node_id_
=
node_id
;
}
uint32_t
OperatorDef
::
op_id
()
const
{
return
op_id_
;
}
uint32_t
OperatorDef
::
padding
()
const
{
return
padding_
;
}
void
OperatorDef
::
set_padding
(
uint32_t
padding
)
{
padding_
=
padding
;
}
const
std
::
vector
<
NodeInput
>
&
OperatorDef
::
node_input
()
const
{
return
node_input_
;
}
...
...
@@ -294,9 +210,7 @@ const std::vector<int> &OperatorDef::out_max_byte_size() const {
void
OperatorDef
::
add_out_max_byte_size
(
int
value
)
{
out_max_byte_size_
.
push_back
(
value
);
}
const
std
::
vector
<
std
::
string
>
&
OperatorDef
::
input
()
const
{
return
input_
;
}
const
std
::
vector
<
std
::
string
>
&
OperatorDef
::
input
()
const
{
return
input_
;
}
const
std
::
string
&
OperatorDef
::
input
(
int
index
)
const
{
MACE_CHECK
(
0
<=
index
&&
index
<=
input_
.
size
());
return
input_
[
index
];
...
...
@@ -308,16 +222,12 @@ std::string *OperatorDef::add_input() {
void
OperatorDef
::
add_input
(
const
::
std
::
string
&
value
)
{
input_
.
push_back
(
value
);
}
void
OperatorDef
::
add_input
(
::
std
::
string
&&
value
)
{
input_
.
push_back
(
value
);
}
void
OperatorDef
::
add_input
(
::
std
::
string
&&
value
)
{
input_
.
push_back
(
value
);
}
void
OperatorDef
::
set_input
(
const
std
::
vector
<
std
::
string
>
&
value
)
{
input_
.
resize
(
value
.
size
());
std
::
copy
(
value
.
begin
(),
value
.
end
(),
input_
.
begin
());
}
const
std
::
vector
<
std
::
string
>
&
OperatorDef
::
output
()
const
{
return
output_
;
}
const
std
::
vector
<
std
::
string
>
&
OperatorDef
::
output
()
const
{
return
output_
;
}
const
std
::
string
&
OperatorDef
::
output
(
int
index
)
const
{
MACE_CHECK
(
0
<=
index
&&
index
<=
output_
.
size
());
return
output_
[
index
];
...
...
@@ -336,9 +246,7 @@ void OperatorDef::set_output(const std::vector<std::string> &value) {
output_
.
resize
(
value
.
size
());
std
::
copy
(
value
.
begin
(),
value
.
end
(),
output_
.
begin
());
}
const
std
::
vector
<
Argument
>
&
OperatorDef
::
arg
()
const
{
return
arg_
;
}
const
std
::
vector
<
Argument
>
&
OperatorDef
::
arg
()
const
{
return
arg_
;
}
Argument
*
OperatorDef
::
add_arg
()
{
arg_
.
emplace_back
(
Argument
());
return
&
arg_
.
back
();
...
...
@@ -358,18 +266,12 @@ void OperatorDef::set_output_type(const std::vector<DataType> &value) {
}
// MemoryBlock
MemoryBlock
::
MemoryBlock
(
int
mem_id
,
uint32_t
x
,
uint32_t
y
)
:
mem_id_
(
mem_id
),
x_
(
x
),
y_
(
y
)
{}
MemoryBlock
::
MemoryBlock
(
int
mem_id
,
uint32_t
x
,
uint32_t
y
)
:
mem_id_
(
mem_id
),
x_
(
x
),
y_
(
y
)
{}
int
MemoryBlock
::
mem_id
()
const
{
return
mem_id_
;
}
uint32_t
MemoryBlock
::
x
()
const
{
return
x_
;
}
uint32_t
MemoryBlock
::
y
()
const
{
return
y_
;
}
int
MemoryBlock
::
mem_id
()
const
{
return
mem_id_
;
}
uint32_t
MemoryBlock
::
x
()
const
{
return
x_
;
}
uint32_t
MemoryBlock
::
y
()
const
{
return
y_
;
}
// MemoryArena
const
std
::
vector
<
MemoryBlock
>
&
MemoryArena
::
mem_block
()
const
{
...
...
@@ -378,131 +280,69 @@ const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
std
::
vector
<
MemoryBlock
>
&
MemoryArena
::
mutable_mem_block
()
{
return
mem_block_
;
}
int
MemoryArena
::
mem_block_size
()
const
{
return
mem_block_
.
size
();
}
int
MemoryArena
::
mem_block_size
()
const
{
return
mem_block_
.
size
();
}
// InputInfo
const
std
::
string
&
InputInfo
::
name
()
const
{
return
name_
;
}
int32_t
InputInfo
::
node_id
()
const
{
return
node_id_
;
}
int32_t
InputInfo
::
max_byte_size
()
const
{
return
max_byte_size_
;
}
DataType
InputInfo
::
data_type
()
const
{
return
data_type_
;
}
const
std
::
vector
<
int32_t
>
&
InputInfo
::
dims
()
const
{
return
dims_
;
}
const
std
::
string
&
InputInfo
::
name
()
const
{
return
name_
;
}
int32_t
InputInfo
::
node_id
()
const
{
return
node_id_
;
}
int32_t
InputInfo
::
max_byte_size
()
const
{
return
max_byte_size_
;
}
DataType
InputInfo
::
data_type
()
const
{
return
data_type_
;
}
const
std
::
vector
<
int32_t
>
&
InputInfo
::
dims
()
const
{
return
dims_
;
}
// OutputInfo
const
std
::
string
&
OutputInfo
::
name
()
const
{
return
name_
;
}
int32_t
OutputInfo
::
node_id
()
const
{
return
node_id_
;
}
int32_t
OutputInfo
::
max_byte_size
()
const
{
return
max_byte_size_
;
}
DataType
OutputInfo
::
data_type
()
const
{
return
data_type_
;
}
void
OutputInfo
::
set_data_type
(
DataType
data_type
)
{
data_type_
=
data_type
;
}
const
std
::
vector
<
int32_t
>
&
OutputInfo
::
dims
()
const
{
return
dims_
;
}
void
OutputInfo
::
set_dims
(
const
std
::
vector
<
int32_t
>
&
dims
)
{
dims_
=
dims
;
}
const
std
::
string
&
OutputInfo
::
name
()
const
{
return
name_
;
}
int32_t
OutputInfo
::
node_id
()
const
{
return
node_id_
;
}
int32_t
OutputInfo
::
max_byte_size
()
const
{
return
max_byte_size_
;
}
DataType
OutputInfo
::
data_type
()
const
{
return
data_type_
;
}
void
OutputInfo
::
set_data_type
(
DataType
data_type
)
{
data_type_
=
data_type
;
}
const
std
::
vector
<
int32_t
>
&
OutputInfo
::
dims
()
const
{
return
dims_
;
}
void
OutputInfo
::
set_dims
(
const
std
::
vector
<
int32_t
>
&
dims
)
{
dims_
=
dims
;
}
// NetDef
NetDef
::
NetDef
()
:
has_bits_
(
0
)
{}
const
std
::
string
&
NetDef
::
name
()
const
{
return
name_
;
}
const
std
::
string
&
NetDef
::
name
()
const
{
return
name_
;
}
void
NetDef
::
set_name
(
const
std
::
string
&
value
)
{
set_has_name
();
name_
=
value
;
}
bool
NetDef
::
has_name
()
const
{
return
(
has_bits_
&
0x00000001u
)
!=
0
;
}
void
NetDef
::
set_has_name
()
{
has_bits_
|=
0x00000001u
;
}
const
std
::
string
&
NetDef
::
version
()
const
{
return
version_
;
}
bool
NetDef
::
has_name
()
const
{
return
(
has_bits_
&
0x00000001u
)
!=
0
;
}
void
NetDef
::
set_has_name
()
{
has_bits_
|=
0x00000001u
;
}
const
std
::
string
&
NetDef
::
version
()
const
{
return
version_
;
}
void
NetDef
::
set_version
(
const
std
::
string
&
value
)
{
set_has_version
();
version_
=
value
;
}
bool
NetDef
::
has_version
()
const
{
return
(
has_bits_
&
0x00000002u
)
!=
0
;
}
void
NetDef
::
set_has_version
()
{
has_bits_
|=
0x00000002u
;
}
const
std
::
vector
<
OperatorDef
>
&
NetDef
::
op
()
const
{
return
op_
;
}
bool
NetDef
::
has_version
()
const
{
return
(
has_bits_
&
0x00000002u
)
!=
0
;
}
void
NetDef
::
set_has_version
()
{
has_bits_
|=
0x00000002u
;
}
const
std
::
vector
<
OperatorDef
>
&
NetDef
::
op
()
const
{
return
op_
;
}
OperatorDef
*
NetDef
::
add_op
()
{
op_
.
emplace_back
(
OperatorDef
());
return
&
op_
.
back
();
}
std
::
vector
<
OperatorDef
>
&
NetDef
::
mutable_op
()
{
return
op_
;
}
const
std
::
vector
<
Argument
>
&
NetDef
::
arg
()
const
{
return
arg_
;
}
std
::
vector
<
OperatorDef
>
&
NetDef
::
mutable_op
()
{
return
op_
;
}
const
std
::
vector
<
Argument
>
&
NetDef
::
arg
()
const
{
return
arg_
;
}
Argument
*
NetDef
::
add_arg
()
{
arg_
.
emplace_back
(
Argument
());
return
&
arg_
.
back
();
}
std
::
vector
<
Argument
>
&
NetDef
::
mutable_arg
()
{
return
arg_
;
}
const
std
::
vector
<
ConstTensor
>
&
NetDef
::
tensors
()
const
{
return
tensors_
;
}
std
::
vector
<
ConstTensor
>
&
NetDef
::
mutable_tensors
()
{
return
tensors_
;
}
const
MemoryArena
&
NetDef
::
mem_arena
()
const
{
return
mem_arena_
;
}
std
::
vector
<
Argument
>
&
NetDef
::
mutable_arg
()
{
return
arg_
;
}
const
std
::
vector
<
ConstTensor
>
&
NetDef
::
tensors
()
const
{
return
tensors_
;
}
std
::
vector
<
ConstTensor
>
&
NetDef
::
mutable_tensors
()
{
return
tensors_
;
}
const
MemoryArena
&
NetDef
::
mem_arena
()
const
{
return
mem_arena_
;
}
MemoryArena
&
NetDef
::
mutable_mem_arena
()
{
set_has_mem_arena
();
return
mem_arena_
;
}
bool
NetDef
::
has_mem_arena
()
const
{
return
(
has_bits_
&
0x00000004u
)
!=
0
;
}
void
NetDef
::
set_has_mem_arena
()
{
has_bits_
|=
0x00000004u
;
}
const
std
::
vector
<
InputInfo
>
&
NetDef
::
input_info
()
const
{
return
input_info_
;
}
bool
NetDef
::
has_mem_arena
()
const
{
return
(
has_bits_
&
0x00000004u
)
!=
0
;
}
void
NetDef
::
set_has_mem_arena
()
{
has_bits_
|=
0x00000004u
;
}
const
std
::
vector
<
InputInfo
>
&
NetDef
::
input_info
()
const
{
return
input_info_
;
}
const
std
::
vector
<
OutputInfo
>
&
NetDef
::
output_info
()
const
{
return
output_info_
;
}
std
::
vector
<
OutputInfo
>
&
NetDef
::
mutable_output_info
()
{
return
output_info_
;
}
std
::
vector
<
OutputInfo
>
&
NetDef
::
mutable_output_info
()
{
return
output_info_
;
}
int
NetDef
::
op_size
()
const
{
return
op_
.
size
();
}
int
NetDef
::
op_size
()
const
{
return
op_
.
size
();
}
const
OperatorDef
&
NetDef
::
op
(
const
int
idx
)
const
{
MACE_CHECK
(
0
<=
idx
&&
idx
<
op_size
());
...
...
@@ -510,14 +350,15 @@ const OperatorDef &NetDef::op(const int idx) const {
}
// Mace Engine
MaceEngine
::
MaceEngine
(
const
NetDef
*
net_def
,
DeviceType
device_type
)
:
op_registry_
(
new
OperatorRegistry
()),
device_type_
(
device_type
),
ws_
(
new
Workspace
()),
net_
(
nullptr
),
hexagon_controller_
(
nullptr
)
{
ws_
->
CreateTensor
(
"mace_input_node:0"
,
GetDeviceAllocator
(
device_type_
),
MaceEngine
::
MaceEngine
(
const
NetDef
*
net_def
,
DeviceType
device_type
)
:
op_registry_
(
new
OperatorRegistry
()),
device_type_
(
device_type
),
ws_
(
new
Workspace
()),
net_
(
nullptr
),
hexagon_controller_
(
nullptr
)
{
ws_
->
CreateTensor
(
"mace_input_node:0"
,
GetDeviceAllocator
(
device_type_
),
DT_FLOAT
);
ws_
->
CreateTensor
(
"mace_output_node:0"
,
GetDeviceAllocator
(
device_type_
),
ws_
->
CreateTensor
(
"mace_output_node:0"
,
GetDeviceAllocator
(
device_type_
),
DT_FLOAT
);
if
(
device_type
==
HEXAGON
)
{
hexagon_controller_
.
reset
(
new
HexagonControlWrapper
());
...
...
@@ -525,8 +366,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
MACE_CHECK
(
hexagon_controller_
->
Init
(),
"hexagon init error"
);
hexagon_controller_
->
SetDebugLevel
(
static_cast
<
int
>
(
mace
::
logging
::
LogMessage
::
MinVLogLevel
()));
int
dsp_mode
=
ArgumentHelper
::
GetSingleArgument
<
NetDef
,
int
>
(
*
net_def
,
"dsp_mode"
,
0
);
int
dsp_mode
=
ArgumentHelper
::
GetSingleArgument
<
NetDef
,
int
>
(
*
net_def
,
"dsp_mode"
,
0
);
hexagon_controller_
->
SetGraphMode
(
dsp_mode
);
MACE_CHECK
(
hexagon_controller_
->
SetupGraph
(
*
net_def
),
"hexagon setup graph error"
);
...
...
@@ -537,8 +378,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
ws_
->
LoadModelTensor
(
*
net_def
,
device_type
);
// Init model
auto
net
=
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type
,
NetMode
::
INIT
);
auto
net
=
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type
,
NetMode
::
INIT
);
if
(
!
net
->
Run
())
{
LOG
(
FATAL
)
<<
"Net init run failed"
;
}
...
...
@@ -548,18 +389,19 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
MaceEngine
::
MaceEngine
(
const
NetDef
*
net_def
,
DeviceType
device_type
,
const
std
::
vector
<
std
::
string
>
&
input_nodes
,
const
std
::
vector
<
std
::
string
>
&
output_nodes
)
:
op_registry_
(
new
OperatorRegistry
()),
device_type_
(
device_type
),
ws_
(
new
Workspace
()),
net_
(
nullptr
),
hexagon_controller_
(
nullptr
)
{
const
std
::
vector
<
std
::
string
>
&
output_nodes
)
:
op_registry_
(
new
OperatorRegistry
()),
device_type_
(
device_type
),
ws_
(
new
Workspace
()),
net_
(
nullptr
),
hexagon_controller_
(
nullptr
)
{
for
(
auto
input_name
:
input_nodes
)
{
ws_
->
CreateTensor
(
MakeString
(
"mace_input_node_"
,
input_name
,
":0"
),
GetDeviceAllocator
(
device_type_
),
DT_FLOAT
);
GetDeviceAllocator
(
device_type_
),
DT_FLOAT
);
}
for
(
auto
output_name
:
output_nodes
)
{
ws_
->
CreateTensor
(
MakeString
(
"mace_output_node_"
,
output_name
,
":0"
),
GetDeviceAllocator
(
device_type_
),
DT_FLOAT
);
GetDeviceAllocator
(
device_type_
),
DT_FLOAT
);
}
if
(
device_type
==
HEXAGON
)
{
hexagon_controller_
.
reset
(
new
HexagonControlWrapper
());
...
...
@@ -567,8 +409,8 @@ MaceEngine::MaceEngine(const NetDef *net_def,
MACE_CHECK
(
hexagon_controller_
->
Init
(),
"hexagon init error"
);
hexagon_controller_
->
SetDebugLevel
(
static_cast
<
int
>
(
mace
::
logging
::
LogMessage
::
MinVLogLevel
()));
int
dsp_mode
=
ArgumentHelper
::
GetSingleArgument
<
NetDef
,
int
>
(
*
net_def
,
"dsp_mode"
,
0
);
int
dsp_mode
=
ArgumentHelper
::
GetSingleArgument
<
NetDef
,
int
>
(
*
net_def
,
"dsp_mode"
,
0
);
hexagon_controller_
->
SetGraphMode
(
dsp_mode
);
MACE_CHECK
(
hexagon_controller_
->
SetupGraph
(
*
net_def
),
"hexagon setup graph error"
);
...
...
@@ -579,14 +421,13 @@ MaceEngine::MaceEngine(const NetDef *net_def,
ws_
->
LoadModelTensor
(
*
net_def
,
device_type
);
// Init model
auto
net
=
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type
,
NetMode
::
INIT
);
auto
net
=
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type
,
NetMode
::
INIT
);
if
(
!
net
->
Run
())
{
LOG
(
FATAL
)
<<
"Net init run failed"
;
}
net_
=
std
::
move
(
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type
));
}
}
MaceEngine
::~
MaceEngine
()
{
if
(
device_type_
==
HEXAGON
)
{
...
...
@@ -643,10 +484,11 @@ bool MaceEngine::Run(const float *input,
bool
MaceEngine
::
Run
(
const
std
::
vector
<
MaceInputInfo
>
&
inputs
,
std
::
map
<
std
::
string
,
float
*>
&
outputs
,
RunMetadata
*
run_metadata
)
{
MACE_CHECK
(
device_type_
!=
HEXAGON
,
"HEXAGON not supports multiple outputs now"
);
MACE_CHECK
(
device_type_
!=
HEXAGON
,
"HEXAGON not supports multiple outputs now"
);
for
(
auto
input
:
inputs
)
{
Tensor
*
input_tensor
=
ws_
->
GetTensor
(
MakeString
(
"mace_input_node_"
,
input
.
name
,
":0"
));
Tensor
*
input_tensor
=
ws_
->
GetTensor
(
MakeString
(
"mace_input_node_"
,
input
.
name
,
":0"
));
input_tensor
->
Resize
(
input
.
shape
);
{
Tensor
::
MappingGuard
input_guard
(
input_tensor
);
...
...
@@ -658,7 +500,8 @@ bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
LOG
(
FATAL
)
<<
"Net run failed"
;
}
for
(
auto
output
:
outputs
)
{
Tensor
*
output_tensor
=
ws_
->
GetTensor
(
MakeString
(
"mace_output_node_"
,
output
.
first
+
":0"
));
Tensor
*
output_tensor
=
ws_
->
GetTensor
(
MakeString
(
"mace_output_node_"
,
output
.
first
+
":0"
));
// save output
if
(
output_tensor
!=
nullptr
&&
output
.
second
!=
nullptr
)
{
Tensor
::
MappingGuard
output_guard
(
output_tensor
);
...
...
mace/core/net.cc
浏览文件 @
4410ecd2
...
...
@@ -3,9 +3,9 @@
//
#include "mace/core/net.h"
#include "mace/utils/utils.h"
#include "mace/utils/timer.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
namespace
mace
{
...
...
@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
Workspace
*
ws
,
DeviceType
type
,
const
NetMode
mode
)
:
NetBase
(
op_registry
,
net_def
,
ws
,
type
),
device_type_
(
type
)
{
:
NetBase
(
op_registry
,
net_def
,
ws
,
type
),
device_type_
(
type
)
{
MACE_LATENCY_LOGGER
(
1
,
"Constructing SerialNet "
,
net_def
->
name
());
for
(
int
idx
=
0
;
idx
<
net_def
->
op_size
();
++
idx
)
{
const
auto
&
operator_def
=
net_def
->
op
(
idx
);
...
...
@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_LATENCY_LOGGER
(
1
,
"Running net"
);
for
(
auto
iter
=
operators_
.
begin
();
iter
!=
operators_
.
end
();
++
iter
)
{
auto
&
op
=
*
iter
;
MACE_LATENCY_LOGGER
(
2
,
"Running operator "
,
op
->
debug_def
().
name
(),
"("
,
op
->
debug_def
().
type
(),
")"
);
MACE_LATENCY_LOGGER
(
2
,
"Running operator "
,
op
->
debug_def
().
name
(),
"("
,
op
->
debug_def
().
type
(),
")"
);
bool
future_wait
=
(
device_type_
==
DeviceType
::
OPENCL
&&
(
run_metadata
!=
nullptr
||
std
::
distance
(
iter
,
operators_
.
end
())
==
1
));
...
...
@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
Workspace
*
ws
,
DeviceType
type
,
const
NetMode
mode
)
{
std
::
unique_ptr
<
NetBase
>
net
(
new
SerialNet
(
op_registry
,
net_def
,
ws
,
type
,
mode
));
std
::
unique_ptr
<
NetBase
>
net
(
new
SerialNet
(
op_registry
,
net_def
,
ws
,
type
,
mode
));
return
net
;
}
...
...
mace/core/operator.h
浏览文件 @
4410ecd2
...
...
@@ -7,10 +7,10 @@
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/public/mace.h"
namespace
mace
{
...
...
mace/core/preallocated_pooled_allocator.h
浏览文件 @
4410ecd2
mace/core/runtime/hexagon/hexagon_control_wrapper.cc
浏览文件 @
4410ecd2
...
...
@@ -2,19 +2,19 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <vector>
#include <thread>
#include <sys/time.h>
#include <thread>
#include <vector>
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
namespace
{
inline
int64_t
NowMicros
()
{
inline
int64_t
NowMicros
()
{
struct
timeval
tv
;
gettimeofday
(
&
tv
,
nullptr
);
return
static_cast
<
int64_t
>
(
tv
.
tv_sec
)
*
1000000
+
tv
.
tv_usec
;
}
}
}
namespace
mace
{
...
...
@@ -63,7 +63,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// const node
std
::
thread
const_thread
([
&
]()
{
std
::
vector
<
hexagon_nn_const_node
>
const_node_list
;
for
(
const
ConstTensor
&
const_tensor
:
net_def
.
tensors
())
{
for
(
const
ConstTensor
&
const_tensor
:
net_def
.
tensors
())
{
std
::
vector
<
int
>
tensor_shape
(
const_tensor
.
dims
().
begin
(),
const_tensor
.
dims
().
end
());
while
(
tensor_shape
.
size
()
<
4
)
{
...
...
@@ -77,30 +77,30 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
const_node
.
tensor
.
width
=
tensor_shape
[
2
];
const_node
.
tensor
.
depth
=
tensor_shape
[
3
];
if
(
const_tensor
.
data_type
()
==
DataType
::
DT_INT32
&&
const_tensor
.
data_size
()
==
0
)
{
if
(
const_tensor
.
data_type
()
==
DataType
::
DT_INT32
&&
const_tensor
.
data_size
()
==
0
)
{
const_node
.
tensor
.
data
=
NULL
;
const_node
.
tensor
.
dataLen
=
0
;
}
else
{
const_node
.
tensor
.
data
=
const_cast
<
unsigned
char
*>
(
const_tensor
.
data
());
const_node
.
tensor
.
dataLen
=
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
());
const_node
.
tensor
.
dataLen
=
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
());
}
const_node_list
.
push_back
(
const_node
);
// 255 is magic number: why fastrpc limits sequence length to that?
if
(
const_node_list
.
size
()
>=
250
)
{
MACE_CHECK
(
hexagon_nn_append_const_node_list
(
nn_id_
,
const_node_list
.
data
(),
const_node_list
.
size
())
==
0
,
"append const node error"
);
MACE_CHECK
(
hexagon_nn_append_const_node_list
(
nn_id_
,
const_node_list
.
data
(),
const_node_list
.
size
())
==
0
,
"append const node error"
);
const_node_list
.
clear
();
}
}
if
(
!
const_node_list
.
empty
())
{
MACE_CHECK
(
hexagon_nn_append_const_node_list
(
nn_id_
,
const_node_list
.
data
(),
MACE_CHECK
(
hexagon_nn_append_const_node_list
(
nn_id_
,
const_node_list
.
data
(),
const_node_list
.
size
())
==
0
,
"append const node error"
);
}
...
...
@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
std
::
vector
<
hexagon_nn_input
>
inputs
;
std
::
vector
<
hexagon_nn_output
>
outputs
;
for
(
const
OperatorDef
&
op
:
net_def
.
op
())
{
for
(
const
OperatorDef
&
op
:
net_def
.
op
())
{
int
op_id
=
op_map
.
GetOpId
(
op
.
type
());
inputs
.
resize
(
op
.
node_input
().
size
());
for
(
size_t
i
=
0
;
i
<
op
.
node_input
().
size
();
++
i
)
{
...
...
@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
cached_inputs
.
push_back
(
inputs
);
cached_outputs
.
push_back
(
outputs
);
hexagon_nn_padding_type
padding_type
=
static_cast
<
hexagon_nn_padding_type
>
(
op
.
padding
());
hexagon_nn_padding_type
padding_type
=
static_cast
<
hexagon_nn_padding_type
>
(
op
.
padding
());
hexagon_nn_op_node
op_node
;
op_node
.
node_id
=
node_id
(
op
.
node_id
());
...
...
@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
op_node_list
.
push_back
(
op_node
);
if
(
op_node_list
.
size
()
>=
125
)
{
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
op_node_list
.
data
(),
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
op_node_list
.
data
(),
op_node_list
.
size
())
==
0
,
"append node error"
);
op_node_list
.
clear
();
...
...
@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
}
if
(
!
op_node_list
.
empty
())
{
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
op_node_list
.
data
(),
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
op_node_list
.
data
(),
op_node_list
.
size
())
==
0
,
"append node error"
);
}
...
...
@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// input info
num_inputs_
=
0
;
for
(
const
InputInfo
&
input_info
:
net_def
.
input_info
())
{
for
(
const
InputInfo
&
input_info
:
net_def
.
input_info
())
{
std
::
vector
<
index_t
>
input_shape
;
input_shape
.
insert
(
input_shape
.
begin
(),
input_info
.
dims
().
begin
(),
input_info
.
dims
().
end
());
input_shape
.
insert
(
input_shape
.
begin
(),
input_info
.
dims
().
begin
(),
input_info
.
dims
().
end
());
while
(
input_shape
.
size
()
<
4
)
{
input_shape
.
insert
(
input_shape
.
begin
(),
1
);
}
...
...
@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// output info
num_outputs_
=
0
;
for
(
const
OutputInfo
&
output_info
:
net_def
.
output_info
())
{
for
(
const
OutputInfo
&
output_info
:
net_def
.
output_info
())
{
std
::
vector
<
index_t
>
output_shape
;
output_shape
.
insert
(
output_shape
.
begin
(),
output_info
.
dims
().
begin
(),
output_info
.
dims
().
end
());
output_shape
.
insert
(
output_shape
.
begin
(),
output_info
.
dims
().
begin
(),
output_info
.
dims
().
end
());
while
(
output_shape
.
size
()
<
4
)
{
output_shape
.
insert
(
output_shape
.
begin
(),
1
);
}
...
...
@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
return
hexagon_nn_teardown
(
nn_id_
)
==
0
;
}
#define PRINT_BUFSIZE (2
*1024*
1024)
#define PRINT_BUFSIZE (2
* 1024 *
1024)
void
HexagonControlWrapper
::
PrintLog
()
{
char
*
buf
;
if
((
buf
=
new
char
[
PRINT_BUFSIZE
])
==
NULL
)
return
;
MACE_CHECK
(
hexagon_nn_getlog
(
nn_id_
,
reinterpret_cast
<
unsigned
char
*>
(
buf
)
,
PRINT_BUFSIZE
)
==
0
,
"print log error"
);
MACE_CHECK
(
hexagon_nn_getlog
(
nn_id_
,
reinterpret_cast
<
unsigned
char
*>
(
buf
),
PRINT_BUFSIZE
)
==
0
,
"print log error"
);
LOG
(
INFO
)
<<
std
::
string
(
buf
);
delete
[]
buf
;
delete
[]
buf
;
}
void
HexagonControlWrapper
::
PrintGraph
()
{
LOG
(
INFO
)
<<
"Print Graph"
;
char
*
buf
;
if
((
buf
=
new
char
[
PRINT_BUFSIZE
])
==
NULL
)
return
;
MACE_CHECK
(
hexagon_nn_snpprint
(
nn_id_
,
reinterpret_cast
<
unsigned
char
*>
(
buf
)
,
PRINT_BUFSIZE
)
==
0
,
"print graph error"
);
MACE_CHECK
(
hexagon_nn_snpprint
(
nn_id_
,
reinterpret_cast
<
unsigned
char
*>
(
buf
),
PRINT_BUFSIZE
)
==
0
,
"print graph error"
);
LOG
(
INFO
)
<<
std
::
string
(
buf
);
delete
[]
buf
;
delete
[]
buf
;
}
void
HexagonControlWrapper
::
SetDebugLevel
(
int
level
)
{
...
...
@@ -256,8 +253,8 @@ void HexagonControlWrapper::GetPerfInfo() {
LOG
(
INFO
)
<<
"Get perf info"
;
std
::
vector
<
hexagon_nn_perfinfo
>
perf_info
(
MAX_NODE
);
unsigned
int
n_items
=
0
;
MACE_CHECK
(
hexagon_nn_get_perfinfo
(
nn_id_
,
perf_info
.
data
(),
MAX_NODE
,
&
n_items
)
==
0
,
MACE_CHECK
(
hexagon_nn_get_perfinfo
(
nn_id_
,
perf_info
.
data
(),
MAX_NODE
,
&
n_items
)
==
0
,
"get perf info error"
);
std
::
unordered_map
<
uint32_t
,
float
>
node_id_counters
;
...
...
@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
unsigned
int
node_id
=
perf_info
[
i
].
node_id
;
unsigned
int
node_type_id
=
perf_info
[
i
].
node_type
;
node_id_counters
[
node_id
]
=
((
static_cast
<
uint64_t
>
(
perf_info
[
i
].
counter_hi
)
<<
32
)
+
perf_info
[
i
].
counter_lo
)
*
1.0
f
/
perf_info
[
i
].
executions
;
((
static_cast
<
uint64_t
>
(
perf_info
[
i
].
counter_hi
)
<<
32
)
+
perf_info
[
i
].
counter_lo
)
*
1.0
f
/
perf_info
[
i
].
executions
;
char
node_type_buf
[
MAX_NODE
];
hexagon_nn_op_id_to_name
(
node_type_id
,
node_type_buf
,
MAX_NODE
);
...
...
@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
total_duration
+=
node_id_counters
[
node_id
];
}
for
(
auto
&
node_type_counter
:
node_type_counters
)
{
for
(
auto
&
node_type_counter
:
node_type_counters
)
{
LOG
(
INFO
)
<<
"node type: "
<<
node_type_counter
.
first
<<
", time: "
<<
node_type_counter
.
second
.
first
<<
", duration: "
<<
node_type_counter
.
second
.
second
;
...
...
@@ -312,32 +310,24 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
output_tensor
->
Resize
(
output_shapes_
[
0
]);
std
::
vector
<
uint32_t
>
output_shape
(
4
);
uint32_t
output_bytes
;
int
res
=
hexagon_nn_execute
(
nn_id_
,
input_tensor
.
shape
()[
0
],
input_tensor
.
shape
()[
1
],
input_tensor
.
shape
()[
2
],
input_tensor
.
shape
()[
3
],
reinterpret_cast
<
const
unsigned
char
*>
(
input_tensor
.
raw_data
()),
input_tensor
.
raw_size
(),
&
output_shape
[
0
],
&
output_shape
[
1
],
&
output_shape
[
2
],
&
output_shape
[
3
],
reinterpret_cast
<
unsigned
char
*>
(
output_tensor
->
raw_mutable_data
()),
output_tensor
->
raw_size
(),
&
output_bytes
);
int
res
=
hexagon_nn_execute
(
nn_id_
,
input_tensor
.
shape
()[
0
],
input_tensor
.
shape
()[
1
],
input_tensor
.
shape
()[
2
],
input_tensor
.
shape
()[
3
],
reinterpret_cast
<
const
unsigned
char
*>
(
input_tensor
.
raw_data
()),
input_tensor
.
raw_size
(),
&
output_shape
[
0
],
&
output_shape
[
1
],
&
output_shape
[
2
],
&
output_shape
[
3
],
reinterpret_cast
<
unsigned
char
*>
(
output_tensor
->
raw_mutable_data
()),
output_tensor
->
raw_size
(),
&
output_bytes
);
MACE_CHECK
(
res
==
0
,
"execute error"
);
MACE_ASSERT
(
output_shape
==
output_shapes_
[
0
],
"wrong output shape inferred"
);
MACE_ASSERT
(
output_shape
==
output_shapes_
[
0
],
"wrong output shape inferred"
);
MACE_ASSERT
(
output_bytes
==
output_tensor
->
raw_size
(),
"wrong output bytes inferred."
);
return
res
==
0
;
};
bool
HexagonControlWrapper
::
ExecuteGraphNew
(
const
std
::
vector
<
Tensor
>
&
input_tensors
,
bool
HexagonControlWrapper
::
ExecuteGraphNew
(
const
std
::
vector
<
Tensor
>
&
input_tensors
,
std
::
vector
<
Tensor
>
*
output_tensors
)
{
LOG
(
INFO
)
<<
"Execute graph new: "
<<
nn_id_
;
int
num_inputs
=
input_tensors
.
size
();
...
...
@@ -369,8 +359,8 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
outputs
[
i
].
dataLen
=
(
*
output_tensors
)[
i
].
raw_size
();
}
int
res
=
hexagon_nn_execute_new
(
nn_id_
,
inputs
,
num_inputs
,
outputs
,
num_outputs
);
int
res
=
hexagon_nn_execute_new
(
nn_id_
,
inputs
,
num_inputs
,
outputs
,
num_outputs
);
for
(
int
i
=
0
;
i
<
num_outputs
;
++
i
)
{
std
::
vector
<
uint32_t
>
output_shape
{
outputs
[
i
].
batches
,
outputs
[
i
].
height
,
...
...
@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
float
*
min_in_data
=
input_tensors
[
1
].
mutable_data
<
float
>
();
input_tensors
[
2
].
Resize
({
1
,
1
,
1
,
1
});
float
*
max_in_data
=
input_tensors
[
2
].
mutable_data
<
float
>
();
quantizer_
.
Quantize
(
input_tensor
,
&
input_tensors
[
0
],
min_in_data
,
quantizer_
.
Quantize
(
input_tensor
,
&
input_tensors
[
0
],
min_in_data
,
max_in_data
);
if
(
!
ExecuteGraphNew
(
input_tensors
,
&
output_tensors
))
{
return
false
;
...
...
@@ -409,9 +397,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
const
float
*
min_out_data
=
output_tensors
[
1
].
data
<
float
>
();
const
float
*
max_out_data
=
output_tensors
[
2
].
data
<
float
>
();
quantizer_
.
DeQuantize
(
output_tensors
[
0
],
*
min_out_data
,
*
max_out_data
,
quantizer_
.
DeQuantize
(
output_tensors
[
0
],
*
min_out_data
,
*
max_out_data
,
output_tensor
);
return
true
;
}
...
...
mace/core/runtime/hexagon/hexagon_control_wrapper.h
浏览文件 @
4410ecd2
...
...
@@ -16,16 +16,17 @@ namespace mace {
class
HexagonControlWrapper
{
public:
HexagonControlWrapper
()
{};
HexagonControlWrapper
(){};
int
GetVersion
();
bool
Config
();
bool
Init
();
bool
Finalize
();
bool
SetupGraph
(
const
NetDef
&
net_def
);
bool
SetupGraph
(
const
NetDef
&
net_def
);
bool
ExecuteGraph
(
const
Tensor
&
input_tensor
,
Tensor
*
output_tensor
);
bool
ExecuteGraphNew
(
const
std
::
vector
<
Tensor
>
&
input_tensors
,
bool
ExecuteGraphNew
(
const
std
::
vector
<
Tensor
>
&
input_tensors
,
std
::
vector
<
Tensor
>
*
output_tensors
);
bool
ExecuteGraphPreQuantize
(
const
Tensor
&
input_tensor
,
Tensor
*
output_tensor
);
bool
ExecuteGraphPreQuantize
(
const
Tensor
&
input_tensor
,
Tensor
*
output_tensor
);
bool
TeardownGraph
();
void
PrintLog
();
...
...
@@ -38,9 +39,7 @@ class HexagonControlWrapper {
private:
static
constexpr
int
NODE_ID_OFFSET
=
10000
;
inline
uint32_t
node_id
(
uint32_t
nodeid
)
{
return
NODE_ID_OFFSET
+
nodeid
;
}
inline
uint32_t
node_id
(
uint32_t
nodeid
)
{
return
NODE_ID_OFFSET
+
nodeid
;
}
int
nn_id_
;
Quantizer
quantizer_
;
...
...
@@ -54,7 +53,6 @@ class HexagonControlWrapper {
DISABLE_COPY_AND_ASSIGN
(
HexagonControlWrapper
);
};
}
#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
mace/core/runtime/hexagon/hexagon_controller_dummy.cc
浏览文件 @
4410ecd2
...
...
@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
return
0
;
}
int
hexagon_controller_DeInitHexagon
()
{
int
hexagon_controller_DeInitHexagon
()
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_config
)(
void
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_init
)(
void
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_debug_level
)(
hexagon_nn_nn_id
id
,
int
level
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_graph_mode
)(
hexagon_nn_nn_id
id
,
int
mode
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_snpprint
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_getlog
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
operation
,
hexagon_nn_padding_type
padding
,
const
hexagon_nn_input
*
inputs
,
int
inputsLen
,
const
hexagon_nn_output
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_op_node
*
ops
,
int
opsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
batches
,
unsigned
int
height
,
unsigned
int
width
,
unsigned
int
depth
,
const
unsigned
char
*
data
,
int
dataLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_const_node
*
consts
,
int
constsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_prepare
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute
)(
hexagon_nn_nn_id
id
,
unsigned
int
batches_in
,
unsigned
int
height_in
,
unsigned
int
width_in
,
unsigned
int
depth_in
,
const
unsigned
char
*
data_in
,
int
data_inLen
,
unsigned
int
*
batches_out
,
unsigned
int
*
height_out
,
unsigned
int
*
width_out
,
unsigned
int
*
depth_out
,
unsigned
char
*
data_out
,
int
data_outLen
,
unsigned
int
*
data_len_out
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_teardown
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_powersave_level
)(
unsigned
int
level
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_get_perfinfo
)(
hexagon_nn_nn_id
id
,
hexagon_nn_perfinfo
*
info_out
,
int
info_outLen
,
unsigned
int
*
n_items
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_reset_perfinfo
)(
hexagon_nn_nn_id
id
,
unsigned
int
event
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_last_execution_cycles
)(
hexagon_nn_nn_id
id
,
unsigned
int
*
cycles_lo
,
unsigned
int
*
cycles_hi
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_version
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_name_to_id
)(
const
char
*
name
,
unsigned
int
*
node_id
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_id_to_name
)(
unsigned
int
node_id
,
char
*
name
,
int
nameLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_disable_dcvs
)(
void
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_GetHexagonBinaryVersion
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_PrintLog
)(
const
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute_new
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_tensordef
*
inputs
,
int
inputsLen
,
hexagon_nn_tensordef
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_config
)(
void
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_init
)(
void
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_debug_level
)(
hexagon_nn_nn_id
id
,
int
level
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_graph_mode
)(
hexagon_nn_nn_id
id
,
int
mode
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_snpprint
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_getlog
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
operation
,
hexagon_nn_padding_type
padding
,
const
hexagon_nn_input
*
inputs
,
int
inputsLen
,
const
hexagon_nn_output
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_op_node
*
ops
,
int
opsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
batches
,
unsigned
int
height
,
unsigned
int
width
,
unsigned
int
depth
,
const
unsigned
char
*
data
,
int
dataLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_const_node
*
consts
,
int
constsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_prepare
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute
)(
hexagon_nn_nn_id
id
,
unsigned
int
batches_in
,
unsigned
int
height_in
,
unsigned
int
width_in
,
unsigned
int
depth_in
,
const
unsigned
char
*
data_in
,
int
data_inLen
,
unsigned
int
*
batches_out
,
unsigned
int
*
height_out
,
unsigned
int
*
width_out
,
unsigned
int
*
depth_out
,
unsigned
char
*
data_out
,
int
data_outLen
,
unsigned
int
*
data_len_out
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_teardown
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_powersave_level
)(
unsigned
int
level
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_get_perfinfo
)(
hexagon_nn_nn_id
id
,
hexagon_nn_perfinfo
*
info_out
,
int
info_outLen
,
unsigned
int
*
n_items
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_reset_perfinfo
)(
hexagon_nn_nn_id
id
,
unsigned
int
event
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_last_execution_cycles
)(
hexagon_nn_nn_id
id
,
unsigned
int
*
cycles_lo
,
unsigned
int
*
cycles_hi
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_version
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_name_to_id
)(
const
char
*
name
,
unsigned
int
*
node_id
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_id_to_name
)(
unsigned
int
node_id
,
char
*
name
,
int
nameLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_disable_dcvs
)(
void
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_GetHexagonBinaryVersion
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_PrintLog
)(
const
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute_new
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_tensordef
*
inputs
,
int
inputsLen
,
hexagon_nn_tensordef
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
{
return
0
;
}
mace/core/runtime/hexagon/hexagon_nn.h
浏览文件 @
4410ecd2
...
...
@@ -30,7 +30,7 @@ extern "C" {
#define __QAIC_STRING1_OBJECT_DEFINED__
#define __STRING1_OBJECT__
typedef
struct
_cstring1_s
{
char
*
data
;
char
*
data
;
int
dataLen
;
}
_cstring1_t
;
...
...
@@ -71,7 +71,7 @@ struct hexagon_nn_tensordef {
unsigned
int
height
;
unsigned
int
width
;
unsigned
int
depth
;
unsigned
char
*
data
;
unsigned
char
*
data
;
int
dataLen
;
unsigned
int
data_valid_len
;
unsigned
int
unused
;
...
...
@@ -81,9 +81,9 @@ struct hexagon_nn_op_node {
unsigned
int
node_id
;
unsigned
int
operation
;
hexagon_nn_padding_type
padding
;
hexagon_nn_input
*
inputs
;
hexagon_nn_input
*
inputs
;
int
inputsLen
;
hexagon_nn_output
*
outputs
;
hexagon_nn_output
*
outputs
;
int
outputsLen
;
};
typedef
struct
hexagon_nn_const_node
hexagon_nn_const_node
;
...
...
@@ -91,30 +91,98 @@ struct hexagon_nn_const_node {
unsigned
int
node_id
;
hexagon_nn_tensordef
tensor
;
};
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_config
)(
void
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_init
)(
void
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_debug_level
)(
hexagon_nn_nn_id
id
,
int
level
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_graph_mode
)(
hexagon_nn_nn_id
id
,
int
mode
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_snpprint
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_getlog
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
operation
,
hexagon_nn_padding_type
padding
,
const
hexagon_nn_input
*
inputs
,
int
inputsLen
,
const
hexagon_nn_output
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_op_node
*
ops
,
int
opsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
batches
,
unsigned
int
height
,
unsigned
int
width
,
unsigned
int
depth
,
const
unsigned
char
*
data
,
int
dataLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_const_node
*
consts
,
int
constsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_prepare
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute
)(
hexagon_nn_nn_id
id
,
unsigned
int
batches_in
,
unsigned
int
height_in
,
unsigned
int
width_in
,
unsigned
int
depth_in
,
const
unsigned
char
*
data_in
,
int
data_inLen
,
unsigned
int
*
batches_out
,
unsigned
int
*
height_out
,
unsigned
int
*
width_out
,
unsigned
int
*
depth_out
,
unsigned
char
*
data_out
,
int
data_outLen
,
unsigned
int
*
data_len_out
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_teardown
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_powersave_level
)(
unsigned
int
level
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_get_perfinfo
)(
hexagon_nn_nn_id
id
,
hexagon_nn_perfinfo
*
info_out
,
int
info_outLen
,
unsigned
int
*
n_items
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_reset_perfinfo
)(
hexagon_nn_nn_id
id
,
unsigned
int
event
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_last_execution_cycles
)(
hexagon_nn_nn_id
id
,
unsigned
int
*
cycles_lo
,
unsigned
int
*
cycles_hi
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_version
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_name_to_id
)(
const
char
*
name
,
unsigned
int
*
node_id
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_id_to_name
)(
unsigned
int
node_id
,
char
*
name
,
int
nameLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_disable_dcvs
)(
void
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_GetHexagonBinaryVersion
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_PrintLog
)(
const
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute_new
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_tensordef
*
inputs
,
int
inputsLen
,
hexagon_nn_tensordef
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_config
)(
void
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_init
)(
void
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_debug_level
)(
hexagon_nn_nn_id
id
,
int
level
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_graph_mode
)(
hexagon_nn_nn_id
id
,
int
mode
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_snpprint
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_getlog
)(
hexagon_nn_nn_id
id
,
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
operation
,
hexagon_nn_padding_type
padding
,
const
hexagon_nn_input
*
inputs
,
int
inputsLen
,
const
hexagon_nn_output
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_op_node
*
ops
,
int
opsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node
)(
hexagon_nn_nn_id
id
,
unsigned
int
node_id
,
unsigned
int
batches
,
unsigned
int
height
,
unsigned
int
width
,
unsigned
int
depth
,
const
unsigned
char
*
data
,
int
dataLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_append_const_node_list
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_const_node
*
consts
,
int
constsLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_prepare
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute
)(
hexagon_nn_nn_id
id
,
unsigned
int
batches_in
,
unsigned
int
height_in
,
unsigned
int
width_in
,
unsigned
int
depth_in
,
const
unsigned
char
*
data_in
,
int
data_inLen
,
unsigned
int
*
batches_out
,
unsigned
int
*
height_out
,
unsigned
int
*
width_out
,
unsigned
int
*
depth_out
,
unsigned
char
*
data_out
,
int
data_outLen
,
unsigned
int
*
data_len_out
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_teardown
)(
hexagon_nn_nn_id
id
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_set_powersave_level
)(
unsigned
int
level
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_get_perfinfo
)(
hexagon_nn_nn_id
id
,
hexagon_nn_perfinfo
*
info_out
,
int
info_outLen
,
unsigned
int
*
n_items
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_reset_perfinfo
)(
hexagon_nn_nn_id
id
,
unsigned
int
event
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_last_execution_cycles
)(
hexagon_nn_nn_id
id
,
unsigned
int
*
cycles_lo
,
unsigned
int
*
cycles_hi
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_version
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_name_to_id
)(
const
char
*
name
,
unsigned
int
*
node_id
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_op_id_to_name
)(
unsigned
int
node_id
,
char
*
name
,
int
nameLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_disable_dcvs
)(
void
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_GetHexagonBinaryVersion
)(
int
*
ver
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_PrintLog
)(
const
unsigned
char
*
buf
,
int
bufLen
)
__QAIC_HEADER_ATTRIBUTE
;
__QAIC_HEADER_EXPORT
int
__QAIC_HEADER
(
hexagon_nn_execute_new
)(
hexagon_nn_nn_id
id
,
const
hexagon_nn_tensordef
*
inputs
,
int
inputsLen
,
hexagon_nn_tensordef
*
outputs
,
int
outputsLen
)
__QAIC_HEADER_ATTRIBUTE
;
#ifdef __cplusplus
}
#endif
...
...
mace/core/runtime/hexagon/hexagon_nn_ops.h
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#ifndef LIBMACE_HEXAGON_NN_OPS_H
#define LIBMACE_HEXAGON_NN_OPS_H
#include "mace/utils/logging.h"
#include <unordered_map>
#include "mace/utils/logging.h"
namespace
mace
{
...
...
@@ -24,8 +24,7 @@ typedef enum op_type_enum {
class
OpMap
{
public:
void
Init
()
{
#define DEF_OP(NAME) \
op_map_[#NAME] = OP_##NAME;
#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
#include "mace/core/runtime/hexagon/ops.h"
...
...
@@ -40,6 +39,7 @@ class OpMap {
return
OP_INVALID
;
}
}
private:
std
::
unordered_map
<
std
::
string
,
int
>
op_map_
;
};
...
...
mace/core/runtime/hexagon/ops.h
浏览文件 @
4410ecd2
...
...
@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
#undef __SELF_DEF_OP_WREF
#undef DEF_OP_WREF
#endif
mace/core/runtime/hexagon/quantize.cc
浏览文件 @
4410ecd2
...
...
@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
float
*
max_out
)
{
float
stepsize
;
float
recip_stepsize
;
QuantizeAdjustRange
(
min_in
,
max_in
,
min_out
,
max_out
,
&
stepsize
,
&
recip_stepsize
);
QuantizeAdjustRange
(
min_in
,
max_in
,
min_out
,
max_out
,
&
stepsize
,
&
recip_stepsize
);
const
float
*
in
=
in_tensor
.
data
<
float
>
();
uint8_t
*
out
=
out_tensor
->
mutable_data
<
uint8_t
>
();
for
(
int
i
=
0
;
i
<
in_tensor
.
size
();
i
++
)
{
const
float
inval
=
in
[
i
];
float
ival
=
static_cast
<
uint8_t
>
((
inval
-
*
min_out
)
*
recip_stepsize
+
0.5
f
);
float
ival
=
static_cast
<
uint8_t
>
((
inval
-
*
min_out
)
*
recip_stepsize
+
0.5
f
);
if
(
ival
<
0
)
ival
=
0
;
if
(
ival
>
255
)
ival
=
255
;
out
[
i
]
=
static_cast
<
uint8_t
>
(
ival
);
...
...
mace/core/runtime/hexagon/quantize.h
浏览文件 @
4410ecd2
...
...
@@ -16,13 +16,17 @@ class Quantizer {
void
Quantize
(
const
Tensor
&
in_tensor
,
Tensor
*
out_tensor
,
float
*
min_out
,
float
*
max_out
);
float
*
min_out
,
float
*
max_out
);
void
Quantize
(
const
Tensor
&
in_tensor
,
const
float
min_in
,
const
float
max_in
,
const
float
min_in
,
const
float
max_in
,
Tensor
*
out_tensor
,
float
*
min_out
,
float
*
max_out
);
float
*
min_out
,
float
*
max_out
);
void
DeQuantize
(
const
Tensor
&
in_tensor
,
const
float
min_in
,
const
float
max_in
,
const
float
min_in
,
const
float
max_in
,
Tensor
*
out_tensor
);
private:
...
...
mace/core/runtime/opencl/opencl_allocator.cc
浏览文件 @
4410ecd2
...
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace
mace
{
...
...
@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
return
0
;
}
}
}
OpenCLAllocator
::
OpenCLAllocator
()
{}
...
...
@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
void
*
OpenCLAllocator
::
NewImage
(
const
std
::
vector
<
size_t
>
&
image_shape
,
const
DataType
dt
)
const
{
MACE_CHECK
(
image_shape
.
size
()
==
2
)
<<
"Image shape's size must equal 2"
;
VLOG
(
3
)
<<
"Allocate OpenCL image: "
<<
image_shape
[
0
]
<<
", "
<<
image_shape
[
1
];
VLOG
(
3
)
<<
"Allocate OpenCL image: "
<<
image_shape
[
0
]
<<
", "
<<
image_shape
[
1
];
cl
::
ImageFormat
img_format
(
CL_RGBA
,
DataTypeToCLChannelType
(
dt
));
cl_int
error
;
cl
::
Image2D
*
cl_image
=
new
cl
::
Image2D
(
OpenCLRuntime
::
Global
()
->
context
(),
CL_MEM_READ_WRITE
|
CL_MEM_ALLOC_HOST_PTR
,
img_format
,
image_shape
[
0
],
image_shape
[
1
],
0
,
nullptr
,
&
error
);
CL_MEM_READ_WRITE
|
CL_MEM_ALLOC_HOST_PTR
,
img_format
,
image_shape
[
0
],
image_shape
[
1
],
0
,
nullptr
,
&
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
error
<<
" with image shape: ["
<<
image_shape
[
0
]
<<
", "
<<
image_shape
[
1
]
<<
"]"
;
...
...
@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
// TODO(heliangliang) Non-blocking call
cl_int
error
;
void
*
mapped_ptr
=
queue
.
enqueueMapBuffer
(
*
cl_buffer
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
offset
,
nbytes
,
nullptr
,
nullptr
,
&
error
);
queue
.
enqueueMapBuffer
(
*
cl_buffer
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
offset
,
nbytes
,
nullptr
,
nullptr
,
&
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
return
mapped_ptr
;
}
...
...
@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch
->
resize
(
2
);
cl_int
error
;
void
*
mapped_ptr
=
OpenCLRuntime
::
Global
()
->
command_queue
().
enqueueMapImage
(
*
cl_image
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
origin
,
region
,
mapped_image_pitch
->
data
(),
mapped_image_pitch
->
data
()
+
1
,
nullptr
,
nullptr
,
&
error
);
void
*
mapped_ptr
=
OpenCLRuntime
::
Global
()
->
command_queue
().
enqueueMapImage
(
*
cl_image
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
origin
,
region
,
mapped_image_pitch
->
data
(),
mapped_image_pitch
->
data
()
+
1
,
nullptr
,
nullptr
,
&
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
error
;
return
mapped_ptr
;
...
...
mace/core/runtime/opencl/opencl_development.cc
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#include <vector>
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace
mace
{
...
...
@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl
::
Device
&
device
,
cl
::
Program
*
program
,
bool
*
is_binary
)
{
extern
const
std
::
map
<
std
::
string
,
std
::
vector
<
unsigned
char
>>
kEncryptedProgramMap
;
extern
const
std
::
map
<
std
::
string
,
std
::
vector
<
unsigned
char
>>
kEncryptedProgramMap
;
*
is_binary
=
false
;
auto
it_source
=
kEncryptedProgramMap
.
find
(
program_name
);
if
(
it_source
==
kEncryptedProgramMap
.
end
())
{
...
...
mace/core/runtime/opencl/opencl_production.cc
浏览文件 @
4410ecd2
...
...
@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
cl
::
Device
&
device
,
cl
::
Program
*
program
,
bool
*
is_binary
)
{
extern
const
std
::
map
<
std
::
string
,
std
::
vector
<
unsigned
char
>>
kCompiledProgramMap
;
extern
const
std
::
map
<
std
::
string
,
std
::
vector
<
unsigned
char
>>
kCompiledProgramMap
;
*
is_binary
=
true
;
auto
it_binary
=
kCompiledProgramMap
.
find
(
binary_file_name_prefix
);
if
(
it_binary
==
kCompiledProgramMap
.
end
())
{
...
...
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
4410ecd2
...
...
@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return
(
stop_nanos_
-
start_nanos_
)
/
1000.0
;
}
double
OpenCLProfilingTimer
::
AccumulatedMicros
()
{
return
accumulated_micros_
;
}
double
OpenCLProfilingTimer
::
AccumulatedMicros
()
{
return
accumulated_micros_
;
}
void
OpenCLProfilingTimer
::
AccumulateTiming
(){
void
OpenCLProfilingTimer
::
AccumulateTiming
()
{
StopTiming
();
accumulated_micros_
+=
(
stop_nanos_
-
start_nanos_
)
/
1000.0
;
}
...
...
@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
cl
::
CommandQueue
command_queue
(
context
,
gpu_device
,
properties
);
const
char
*
kernel_path
=
getenv
(
"MACE_KERNEL_PATH"
);
this
->
kernel_path_
=
std
::
string
(
kernel_path
==
nullptr
?
""
:
kernel_path
)
+
"/"
;
this
->
kernel_path_
=
std
::
string
(
kernel_path
==
nullptr
?
""
:
kernel_path
)
+
"/"
;
this
->
device_
=
new
cl
::
Device
(
gpu_device
);
this
->
context_
=
new
cl
::
Context
(
context
);
...
...
@@ -166,15 +165,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
GenerateCLBinaryFilenamePrefix
(
built_program_key
);
std
::
vector
<
unsigned
char
>
program_vec
;
bool
is_opencl_binary
;
const
bool
found
=
GetSourceOrBinaryProgram
(
program_name
,
binary_file_name_prefix
,
context
(),
device
(),
program
,
&
is_opencl_binary
);
const
bool
found
=
GetSourceOrBinaryProgram
(
program_name
,
binary_file_name_prefix
,
context
(),
device
(),
program
,
&
is_opencl_binary
);
MACE_CHECK
(
found
,
"Program not found for "
,
is_opencl_binary
?
"binary: "
:
"source: "
,
built_program_key
);
is_opencl_binary
?
"binary: "
:
"source: "
,
built_program_key
);
// Build program
std
::
string
build_options_str
=
...
...
@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
}
LOG
(
FATAL
)
<<
"Build program from "
<<
(
is_opencl_binary
?
"binary: "
:
"source: "
)
<<
built_program_key
<<
" failed: "
<<
ret
;
<<
built_program_key
<<
" failed: "
<<
ret
;
}
if
(
!
is_opencl_binary
)
{
// Write binary if necessary
std
::
string
binary_filename
=
kernel_path_
+
binary_file_name_prefix
+
".bin"
;
std
::
string
binary_filename
=
kernel_path_
+
binary_file_name_prefix
+
".bin"
;
size_t
device_list_size
=
1
;
std
::
unique_ptr
<
size_t
[]
>
program_binary_sizes
(
new
size_t
[
device_list_size
]);
...
...
@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
if
(
built_program_it
!=
built_program_map_
.
end
())
{
program
=
built_program_it
->
second
;
}
else
{
this
->
BuildProgram
(
program_name
,
built_program_key
,
build_options_str
,
&
program
);
this
->
BuildProgram
(
program_name
,
built_program_key
,
build_options_str
,
&
program
);
built_program_map_
.
emplace
(
built_program_key
,
program
);
}
return
cl
::
Kernel
(
program
,
kernel_name
.
c_str
());
...
...
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
4410ecd2
...
...
@@ -19,7 +19,8 @@ namespace mace {
class
OpenCLProfilingTimer
:
public
Timer
{
public:
explicit
OpenCLProfilingTimer
(
const
cl
::
Event
*
event
)
:
event_
(
event
),
accumulated_micros_
(
0
)
{};
explicit
OpenCLProfilingTimer
(
const
cl
::
Event
*
event
)
:
event_
(
event
),
accumulated_micros_
(
0
){};
void
StartTiming
()
override
;
void
StopTiming
()
override
;
void
AccumulateTiming
()
override
;
...
...
@@ -48,6 +49,7 @@ class OpenCLRuntime {
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
private:
OpenCLRuntime
();
~
OpenCLRuntime
();
...
...
mace/core/runtime/opencl/opencl_wrapper.h
浏览文件 @
4410ecd2
...
...
@@ -7,9 +7,9 @@
namespace
mace
{
// These functions are not thread-safe.
void
LoadOpenCLLibrary
();
void
UnloadOpenCLLibrary
();
// These functions are not thread-safe.
void
LoadOpenCLLibrary
();
void
UnloadOpenCLLibrary
();
}
// namespace mace
...
...
mace/core/tensor.h
浏览文件 @
4410ecd2
...
...
@@ -69,13 +69,10 @@ class Tensor {
dtype_
(
type
),
buffer_
(
nullptr
),
is_buffer_owner_
(
true
),
name_
(
""
)
{};
name_
(
""
)
{};
Tensor
(
BufferBase
*
buffer
,
DataType
dtype
)
:
dtype_
(
dtype
),
buffer_
(
buffer
),
is_buffer_owner_
(
false
),
name_
(
""
)
{}
:
dtype_
(
dtype
),
buffer_
(
buffer
),
is_buffer_owner_
(
false
),
name_
(
""
)
{}
Tensor
(
const
BufferSlice
&
buffer_slice
,
DataType
dtype
)
:
dtype_
(
dtype
),
...
...
@@ -102,8 +99,8 @@ class Tensor {
inline
index_t
dim_size
()
const
{
return
shape_
.
size
();
}
inline
index_t
dim
(
unsigned
int
index
)
const
{
MACE_CHECK
(
index
<
shape_
.
size
(),
"Dim out of range: "
,
index
,
" >= "
,
shape_
.
size
());
MACE_CHECK
(
index
<
shape_
.
size
(),
"Dim out of range: "
,
index
,
" >= "
,
shape_
.
size
());
return
shape_
[
index
];
}
...
...
@@ -112,40 +109,35 @@ class Tensor {
std
::
multiplies
<
int64_t
>
());
}
inline
index_t
raw_size
()
const
{
return
size
()
*
SizeOfType
();
}
inline
index_t
raw_size
()
const
{
return
size
()
*
SizeOfType
();
}
inline
bool
has_opencl_image
()
const
{
return
buffer_
!=
nullptr
&&
!
buffer_
->
OnHost
()
&&
typeid
(
*
buffer_
)
==
typeid
(
Image
);
return
buffer_
!=
nullptr
&&
!
buffer_
->
OnHost
()
&&
typeid
(
*
buffer_
)
==
typeid
(
Image
);
}
inline
bool
has_opencl_buffer
()
const
{
return
buffer_
!=
nullptr
&&
!
buffer_
->
OnHost
()
&&
!
has_opencl_image
();
return
buffer_
!=
nullptr
&&
!
buffer_
->
OnHost
()
&&
!
has_opencl_image
();
}
inline
cl
::
Image
*
opencl_image
()
const
{
MACE_CHECK
(
has_opencl_image
(),
"do not have image"
);
return
static_cast
<
cl
::
Image
*>
(
buffer_
->
buffer
());
return
static_cast
<
cl
::
Image
*>
(
buffer_
->
buffer
());
}
inline
cl
::
Buffer
*
opencl_buffer
()
const
{
MACE_CHECK
(
has_opencl_buffer
(),
"do not have opencl buffer"
);
return
static_cast
<
cl
::
Buffer
*>
(
buffer_
->
buffer
());
return
static_cast
<
cl
::
Buffer
*>
(
buffer_
->
buffer
());
}
inline
index_t
buffer_offset
()
const
{
return
buffer_
->
offset
();
}
inline
index_t
buffer_offset
()
const
{
return
buffer_
->
offset
();
}
inline
const
void
*
raw_data
()
const
{
MACE_CHECK
(
buffer_
!=
nullptr
,
"buffer is null"
);
return
buffer_
->
raw_data
();
}
template
<
typename
T
>
template
<
typename
T
>
inline
const
T
*
data
()
const
{
MACE_CHECK
(
buffer_
!=
nullptr
,
"buffer is null"
);
return
buffer_
->
data
<
T
>
();
...
...
@@ -156,7 +148,7 @@ class Tensor {
return
buffer_
->
raw_mutable_data
();
}
template
<
typename
T
>
template
<
typename
T
>
inline
T
*
mutable_data
()
{
MACE_CHECK
(
buffer_
!=
nullptr
,
"buffer is null"
);
return
static_cast
<
T
*>
(
buffer_
->
raw_mutable_data
());
...
...
@@ -188,25 +180,17 @@ class Tensor {
is_buffer_owner_
=
true
;
}
else
{
MACE_CHECK
(
has_opencl_image
(),
"Cannot ResizeImage buffer, use Resize."
);
Image
*
image
=
dynamic_cast
<
Image
*>
(
buffer_
);
MACE_CHECK
(
image_shape
[
0
]
<=
image
->
image_shape
()[
0
]
&&
image_shape
[
1
]
<=
image
->
image_shape
()[
1
],
"tensor (source op "
,
name_
,
"): current physical image shape: "
,
image
->
image_shape
()[
0
],
", "
,
image
->
image_shape
()[
1
],
" < logical image shape: "
,
image_shape
[
0
],
", "
,
image_shape
[
1
]);
Image
*
image
=
dynamic_cast
<
Image
*>
(
buffer_
);
MACE_CHECK
(
image_shape
[
0
]
<=
image
->
image_shape
()[
0
]
&&
image_shape
[
1
]
<=
image
->
image_shape
()[
1
],
"tensor (source op "
,
name_
,
"): current physical image shape: "
,
image
->
image_shape
()[
0
],
", "
,
image
->
image_shape
()[
1
],
" < logical image shape: "
,
image_shape
[
0
],
", "
,
image_shape
[
1
]);
}
}
inline
void
ResizeLike
(
const
Tensor
&
other
)
{
ResizeLike
(
&
other
);
}
inline
void
ResizeLike
(
const
Tensor
&
other
)
{
ResizeLike
(
&
other
);
}
inline
void
ResizeLike
(
const
Tensor
*
other
)
{
if
(
other
->
has_opencl_image
())
{
...
...
@@ -229,7 +213,7 @@ class Tensor {
memcpy
(
buffer_
->
raw_mutable_data
(),
src
,
size
);
}
template
<
typename
T
>
template
<
typename
T
>
inline
void
Copy
(
const
T
*
src
,
index_t
length
)
{
MACE_CHECK
(
length
==
size
(),
"copy src and dst with different size."
);
CopyBytes
(
static_cast
<
const
void
*>
(
src
),
sizeof
(
T
)
*
length
);
...
...
@@ -248,13 +232,9 @@ class Tensor {
return
type_size
;
}
inline
BufferBase
*
UnderlyingBuffer
()
const
{
return
buffer_
;
}
inline
BufferBase
*
UnderlyingBuffer
()
const
{
return
buffer_
;
}
inline
void
SetSourceOpName
(
const
std
::
string
name
)
{
name_
=
name
;
}
inline
void
SetSourceOpName
(
const
std
::
string
name
)
{
name_
=
name
;
}
inline
void
DebugPrint
()
const
{
using
namespace
numerical_chars
;
...
...
@@ -272,8 +252,9 @@ class Tensor {
}
CASES
(
dtype_
,
(
os
<<
(
this
->
data
<
T
>
()[
i
])
<<
", "
));
}
LOG
(
INFO
)
<<
"Tensor size: ["
<<
dim
(
0
)
<<
", "
<<
dim
(
1
)
<<
", "
<<
dim
(
2
)
<<
", "
<<
dim
(
3
)
<<
"], content:
\n
"
<<
os
.
str
();
LOG
(
INFO
)
<<
"Tensor size: ["
<<
dim
(
0
)
<<
", "
<<
dim
(
1
)
<<
", "
<<
dim
(
2
)
<<
", "
<<
dim
(
3
)
<<
"], content:
\n
"
<<
os
.
str
();
}
class
MappingGuard
{
...
...
@@ -308,7 +289,7 @@ class Tensor {
Allocator
*
allocator_
;
DataType
dtype_
;
std
::
vector
<
index_t
>
shape_
;
std
::
vector
<
size_t
>
image_shape_
;
std
::
vector
<
size_t
>
image_shape_
;
BufferBase
*
buffer_
;
BufferSlice
buffer_slice_
;
bool
is_buffer_owner_
;
...
...
mace/core/testing/test_benchmark.cc
浏览文件 @
4410ecd2
...
...
@@ -99,9 +99,7 @@ void RestartTiming() {
accum_time
=
0
;
start_time
=
NowMicros
();
}
void
StartTiming
()
{
start_time
=
NowMicros
();
}
void
StartTiming
()
{
start_time
=
NowMicros
();
}
void
StopTiming
()
{
if
(
start_time
!=
0
)
{
accum_time
+=
(
NowMicros
()
-
start_time
);
...
...
mace/core/testing/test_benchmark.h
浏览文件 @
4410ecd2
...
...
@@ -6,9 +6,9 @@
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <string>
#include <utility>
#include <vector>
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
...
...
mace/core/types.cc
浏览文件 @
4410ecd2
...
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <map>
#include <cstdint>
#include <map>
#include "mace/core/types.h"
#include "mace/utils/logging.h"
...
...
@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
std
::
string
DataTypeToString
(
const
DataType
dt
)
{
static
std
::
map
<
DataType
,
std
::
string
>
dtype_string_map
=
{
{
DT_FLOAT
,
"DT_FLOAT"
},
{
DT_HALF
,
"DT_HALF"
},
{
DT_DOUBLE
,
"DT_DOUBLE"
},
{
DT_UINT8
,
"DT_UINT8"
},
{
DT_INT8
,
"DT_INT8"
},
{
DT_INT32
,
"DT_INT32"
},
{
DT_UINT32
,
"DT_UINT32"
},
{
DT_UINT16
,
"DT_UINT16"
},
{
DT_INT64
,
"DT_INT64"
},
{
DT_BOOL
,
"DT_BOOL"
},
{
DT_STRING
,
"DT_STRING"
}
};
{
DT_FLOAT
,
"DT_FLOAT"
},
{
DT_HALF
,
"DT_HALF"
},
{
DT_DOUBLE
,
"DT_DOUBLE"
},
{
DT_UINT8
,
"DT_UINT8"
},
{
DT_INT8
,
"DT_INT8"
},
{
DT_INT32
,
"DT_INT32"
},
{
DT_UINT32
,
"DT_UINT32"
},
{
DT_UINT16
,
"DT_UINT16"
},
{
DT_INT64
,
"DT_INT64"
},
{
DT_BOOL
,
"DT_BOOL"
},
{
DT_STRING
,
"DT_STRING"
}};
MACE_CHECK
(
dt
!=
DT_INVALID
)
<<
"Not support Invalid data type"
;
return
dtype_string_map
[
dt
];
}
...
...
mace/core/workspace.cc
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#include <string>
#include <vector>
#include "mace/core/workspace.h"
#include "mace/core/arg_helper.h"
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
namespace
mace
{
...
...
@@ -51,19 +51,19 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
index_t
model_data_size
=
0
;
unsigned
char
*
model_data_ptr
=
nullptr
;
for
(
auto
&
const_tensor
:
net_def
.
tensors
())
{
if
(
model_data_ptr
==
nullptr
||
reinterpret_cast
<
long
long
>
(
const_tensor
.
data
())
<
reinterpret_cast
<
long
long
>
(
model_data_ptr
))
{
if
(
model_data_ptr
==
nullptr
||
reinterpret_cast
<
long
long
>
(
const_tensor
.
data
())
<
reinterpret_cast
<
long
long
>
(
model_data_ptr
))
{
model_data_ptr
=
const_cast
<
unsigned
char
*>
(
const_tensor
.
data
());
}
}
for
(
auto
&
const_tensor
:
net_def
.
tensors
())
{
model_data_size
=
std
::
max
(
model_data_size
,
static_cast
<
index_t
>
(
(
reinterpret_cast
<
long
long
>
(
const_tensor
.
data
())
-
reinterpret_cast
<
long
long
>
(
model_data_ptr
))
+
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
())));
model_data_size
=
std
::
max
(
model_data_size
,
static_cast
<
index_t
>
((
reinterpret_cast
<
long
long
>
(
const_tensor
.
data
())
-
reinterpret_cast
<
long
long
>
(
model_data_ptr
))
+
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
())));
}
VLOG
(
3
)
<<
"Model data size: "
<<
model_data_size
;
...
...
@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
for
(
auto
&
const_tensor
:
net_def
.
tensors
())
{
MACE_LATENCY_LOGGER
(
2
,
"Load tensor "
,
const_tensor
.
name
());
VLOG
(
3
)
<<
"Tensor name: "
<<
const_tensor
.
name
()
<<
", data type: "
<<
const_tensor
.
data_type
()
<<
", shape: "
<<
", data type: "
<<
const_tensor
.
data_type
()
<<
", shape: "
<<
MakeString
(
std
::
vector
<
index_t
>
(
const_tensor
.
dims
().
begin
(),
const_tensor
.
dims
().
end
()));
std
::
vector
<
index_t
>
dims
;
...
...
@@ -90,13 +89,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
dims
.
push_back
(
d
);
}
index_t
offset
=
(
long
long
)
const_tensor
.
data
()
-
(
long
long
)
model_data_ptr
;
index_t
offset
=
(
long
long
)
const_tensor
.
data
()
-
(
long
long
)
model_data_ptr
;
std
::
unique_ptr
<
Tensor
>
tensor
(
new
Tensor
(
BufferSlice
(
tensor_buffer_
.
get
(),
offset
,
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
())),
new
Tensor
(
BufferSlice
(
tensor_buffer_
.
get
(),
offset
,
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
())),
const_tensor
.
data_type
()));
tensor
->
Reshape
(
dims
);
...
...
@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
// as GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for
(
auto
&
op
:
net_def
.
op
())
{
for
(
auto
&
op
:
net_def
.
op
())
{
if
(
op
.
has_mem_id
())
{
const
DataType
op_dtype
=
static_cast
<
DataType
>
(
ArgumentHelper
::
GetSingleArgument
<
OperatorDef
,
int
>
(
op
,
"T"
,
static_cast
<
int
>
(
DT_FLOAT
)));
op
,
"T"
,
static_cast
<
int
>
(
DT_FLOAT
)));
if
(
op_dtype
!=
DataType
::
DT_INVALID
)
{
dtype
=
op_dtype
;
// find first valid data type, break
...
...
@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
}
MACE_CHECK
(
dtype
!=
DataType
::
DT_INVALID
,
"data type is invalid."
);
for
(
auto
&
mem_block
:
net_def
.
mem_arena
().
mem_block
())
{
std
::
unique_ptr
<
BufferBase
>
image_buf
(
new
Image
({
mem_block
.
x
(),
mem_block
.
y
()},
dtype
));
for
(
auto
&
mem_block
:
net_def
.
mem_arena
().
mem_block
())
{
std
::
unique_ptr
<
BufferBase
>
image_buf
(
new
Image
({
mem_block
.
x
(),
mem_block
.
y
()},
dtype
));
preallocated_allocator_
.
SetBuffer
(
mem_block
.
mem_id
(),
std
::
move
(
image_buf
));
}
VLOG
(
3
)
<<
"Preallocate image to tensors"
;
for
(
auto
&
op
:
net_def
.
op
())
{
for
(
auto
&
op
:
net_def
.
op
())
{
if
(
op
.
has_mem_id
())
{
std
::
unique_ptr
<
Tensor
>
tensor
(
new
Tensor
(
preallocated_allocator_
.
GetBuffer
(
op
.
mem_id
()),
dtype
));
std
::
unique_ptr
<
Tensor
>
tensor
(
new
Tensor
(
preallocated_allocator_
.
GetBuffer
(
op
.
mem_id
()),
dtype
));
tensor
->
SetSourceOpName
(
op
.
name
());
VLOG
(
3
)
<<
"Tensor: "
<<
op
.
name
()
<<
"("
<<
op
.
type
()
<<
")"
<<
"; Mem: "
<<
op
.
mem_id
()
<<
"; Image shape: "
VLOG
(
3
)
<<
"Tensor: "
<<
op
.
name
()
<<
"("
<<
op
.
type
()
<<
")"
<<
"; Mem: "
<<
op
.
mem_id
()
<<
"; Image shape: "
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
0
]
<<
", "
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
1
];
<<
dynamic_cast
<
Image
*>
(
tensor
->
UnderlyingBuffer
())
->
image_shape
()[
1
];
tensor_map_
[
op
.
output
(
0
)]
=
std
::
move
(
tensor
);
}
}
...
...
mace/core/workspace.h
浏览文件 @
4410ecd2
...
...
@@ -5,9 +5,9 @@
#ifndef MACE_CORE_WORKSPACE_H_
#define MACE_CORE_WORKSPACE_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/preallocated_pooled_allocator.h"
namespace
mace
{
...
...
mace/kernels/activation.h
浏览文件 @
4410ecd2
...
...
@@ -6,9 +6,9 @@
#define MACE_KERNELS_ACTIVATION_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -99,15 +99,13 @@ void PReLUActivation(const T *input_ptr,
output_ptr
[
i
]
=
in
;
}
}
}
template
<
DeviceType
D
,
typename
T
>
class
ActivationFunctor
{
public:
ActivationFunctor
(
ActivationType
type
,
T
relux_max_limit
)
:
activation_
(
type
),
relux_max_limit_
(
relux_max_limit
){}
:
activation_
(
type
),
relux_max_limit_
(
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
alpha
,
...
...
@@ -118,9 +116,11 @@ class ActivationFunctor {
if
(
activation_
==
PRELU
)
{
MACE_CHECK_NOTNULL
(
alpha
);
const
T
*
alpha_ptr
=
alpha
->
data
<
T
>
();
PReLUActivation
(
input_ptr
,
output
->
size
(),
input
->
dim
(
3
),
alpha_ptr
,
output_ptr
);
PReLUActivation
(
input_ptr
,
output
->
size
(),
input
->
dim
(
3
),
alpha_ptr
,
output_ptr
);
}
else
{
DoActivation
(
input_ptr
,
output_ptr
,
output
->
size
(),
activation_
,
relux_max_limit_
);
DoActivation
(
input_ptr
,
output_ptr
,
output
->
size
(),
activation_
,
relux_max_limit_
);
}
}
...
...
@@ -131,14 +131,16 @@ class ActivationFunctor {
template
<
>
void
ActivationFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
alpha
,
Tensor
*
output
,
StatsFuture
*
future
);
const
Tensor
*
input
,
const
Tensor
*
alpha
,
Tensor
*
output
,
StatsFuture
*
future
);
template
<
typename
T
>
class
ActivationFunctor
<
DeviceType
::
OPENCL
,
T
>
{
public:
ActivationFunctor
(
ActivationType
type
,
T
relux_max_limit
)
:
activation_
(
type
),
relux_max_limit_
(
relux_max_limit
){}
:
activation_
(
type
),
relux_max_limit_
(
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
alpha
,
...
...
mace/kernels/addn.h
浏览文件 @
4410ecd2
...
...
@@ -18,7 +18,7 @@ namespace mace {
namespace
kernels
{
namespace
{
constexpr
int
kCostPerGroup
=
1024
;
constexpr
int
kCostPerGroup
=
1024
;
}
// namespace
template
<
DeviceType
D
,
typename
T
>
...
...
mace/kernels/batch_norm.h
浏览文件 @
4410ecd2
...
...
@@ -10,10 +10,10 @@
#endif
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
const
float
relux_max_limit
)
:
folded_constant_
(
folded_constant
),
activation_
(
activation
),
relux_max_limit_
(
relux_max_limit
){}
relux_max_limit_
(
relux_max_limit
)
{}
const
bool
folded_constant_
;
const
ActivationType
activation_
;
...
...
@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
BatchNormFunctor
(
const
bool
folded_constant
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
BatchNormFunctorBase
(
folded_constant
,
activation
,
relux_max_limit
)
{}
:
BatchNormFunctorBase
(
folded_constant
,
activation
,
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
scale
,
...
...
@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor
(
const
bool
folded_constant
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
BatchNormFunctorBase
(
folded_constant
,
activation
,
relux_max_limit
)
{}
:
BatchNormFunctorBase
(
folded_constant
,
activation
,
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
scale
,
const
Tensor
*
offset
,
...
...
mace/kernels/bias_add.h
浏览文件 @
4410ecd2
...
...
@@ -6,9 +6,9 @@
#define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -32,7 +32,6 @@ struct BiasAddFunctor {
const
T
*
bias_ptr
=
bias
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
#pragma omp parallel for collapse(4)
for
(
index_t
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
...
...
@@ -44,7 +43,6 @@ struct BiasAddFunctor {
}
}
}
}
};
...
...
mace/kernels/buffer_to_image.h
浏览文件 @
4410ecd2
...
...
@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
bool
i2b_
;
};
template
<
DeviceType
D
,
typename
T
>
struct
BufferToImageFunctor
:
BufferToImageFunctorBase
{
BufferToImageFunctor
(
bool
i2b
=
false
)
:
BufferToImageFunctorBase
(
i2b
)
{}
template
<
DeviceType
D
,
typename
T
>
struct
BufferToImageFunctor
:
BufferToImageFunctorBase
{
BufferToImageFunctor
(
bool
i2b
=
false
)
:
BufferToImageFunctorBase
(
i2b
)
{}
void
operator
()(
Tensor
*
input
,
const
BufferType
type
,
Tensor
*
output
,
...
...
@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
}
};
template
<
typename
T
>
struct
BufferToImageFunctor
<
DeviceType
::
OPENCL
,
T
>
:
BufferToImageFunctorBase
{
BufferToImageFunctor
(
bool
i2b
=
false
)
:
BufferToImageFunctorBase
(
i2b
)
{}
template
<
typename
T
>
struct
BufferToImageFunctor
<
DeviceType
::
OPENCL
,
T
>
:
BufferToImageFunctorBase
{
BufferToImageFunctor
(
bool
i2b
=
false
)
:
BufferToImageFunctorBase
(
i2b
)
{}
void
operator
()(
Tensor
*
input
,
const
BufferType
type
,
Tensor
*
output
,
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
4410ecd2
...
...
@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
public:
ChannelShuffleFunctor
(
const
int
group
)
:
group_
(
group
)
{}
void
operator
()(
const
T
*
input
,
const
index_t
*
input_shape
,
T
*
output
,
StatsFuture
*
future
)
{
void
operator
()(
const
T
*
input
,
const
index_t
*
input_shape
,
T
*
output
,
StatsFuture
*
future
)
{
index_t
batch
=
input_shape
[
0
];
index_t
channels
=
input_shape
[
1
];
index_t
height
=
input_shape
[
2
];
...
...
mace/kernels/concat.h
浏览文件 @
4410ecd2
...
...
@@ -6,23 +6,23 @@
#define MACE_KERNELS_CONCAT_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace
mace
{
namespace
kernels
{
struct
ConcatFunctorBase
{
ConcatFunctorBase
(
const
int32_t
axis
)
:
axis_
(
axis
)
{}
ConcatFunctorBase
(
const
int32_t
axis
)
:
axis_
(
axis
)
{}
int32_t
axis_
;
};
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
ConcatFunctor
:
ConcatFunctorBase
{
ConcatFunctor
(
const
int32_t
axis
)
:
ConcatFunctorBase
(
axis
)
{}
ConcatFunctor
(
const
int32_t
axis
)
:
ConcatFunctorBase
(
axis
)
{}
void
operator
()(
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
Tensor
*
output
,
...
...
@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
}
};
template
<
typename
T
>
struct
ConcatFunctor
<
DeviceType
::
OPENCL
,
T
>
:
ConcatFunctorBase
{
ConcatFunctor
(
const
int32_t
axis
)
:
ConcatFunctorBase
(
axis
)
{}
template
<
typename
T
>
struct
ConcatFunctor
<
DeviceType
::
OPENCL
,
T
>
:
ConcatFunctorBase
{
ConcatFunctor
(
const
int32_t
axis
)
:
ConcatFunctorBase
(
axis
)
{}
void
operator
()(
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
Tensor
*
output
,
StatsFuture
*
future
);
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
};
}
// namepsace kernels
...
...
mace/kernels/conv_2d.h
浏览文件 @
4410ecd2
...
...
@@ -116,8 +116,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
sum
[
sum_idx
]
+=
vaddvq_f32
(
tmp
);
#else
for
(
int
inci
=
0
;
inci
<
inc_tile_size
;
++
inci
)
{
sum
[
sum_idx
]
+=
in
[
in_idx
*
inc_tile_size
+
inci
]
*
sum
[
sum_idx
]
+=
in
[
in_idx
*
inc_tile_size
+
inci
]
*
weights
[
weights_idx
*
inc_tile_size
+
inci
];
}
#endif
...
...
@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
paddings_
(
paddings
),
dilations_
(
dilations
),
activation_
(
activation
),
relux_max_limit_
(
relux_max_limit
){}
relux_max_limit_
(
relux_max_limit
)
{}
const
int
*
strides_
;
// [stride_h, stride_w]
const
Padding
padding_type_
;
...
...
@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
CalcOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
}
output
->
Resize
(
output_shape
);
...
...
mace/kernels/conv_pool_2d_util.cc
浏览文件 @
4410ecd2
...
...
@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
*/
output_shape
[
0
]
=
input_shape
[
0
];
if
(
round_type
==
FLOOR
)
{
output_shape
[
1
]
=
static_cast
<
index_t
>
(
std
::
floor
(
1.0
*
(
input_shape
[
1
]
+
padding_size
[
0
]
-
filter_shape
[
0
]
-
(
filter_shape
[
0
]
-
1
)
*
(
dilations
[
0
]
-
1
))
/
strides
[
0
])
+
1
);
output_shape
[
2
]
=
static_cast
<
index_t
>
(
std
::
floor
(
1.0
*
(
input_shape
[
2
]
+
padding_size
[
1
]
-
filter_shape
[
1
]
-
(
filter_shape
[
1
]
-
1
)
*
(
dilations
[
1
]
-
1
))
/
strides
[
1
])
+
1
);
output_shape
[
1
]
=
static_cast
<
index_t
>
(
std
::
floor
(
1.0
*
(
input_shape
[
1
]
+
padding_size
[
0
]
-
filter_shape
[
0
]
-
(
filter_shape
[
0
]
-
1
)
*
(
dilations
[
0
]
-
1
))
/
strides
[
0
])
+
1
);
output_shape
[
2
]
=
static_cast
<
index_t
>
(
std
::
floor
(
1.0
*
(
input_shape
[
2
]
+
padding_size
[
1
]
-
filter_shape
[
1
]
-
(
filter_shape
[
1
]
-
1
)
*
(
dilations
[
1
]
-
1
))
/
strides
[
1
])
+
1
);
}
else
{
output_shape
[
1
]
=
static_cast
<
index_t
>
(
std
::
ceil
(
1.0
*
(
input_shape
[
1
]
+
padding_size
[
0
]
-
filter_shape
[
0
]
-
(
filter_shape
[
0
]
-
1
)
*
(
dilations
[
0
]
-
1
))
/
strides
[
0
])
+
1
);
output_shape
[
2
]
=
static_cast
<
index_t
>
(
std
::
ceil
(
1.0
*
(
input_shape
[
2
]
+
padding_size
[
1
]
-
filter_shape
[
1
]
-
(
filter_shape
[
1
]
-
1
)
*
(
dilations
[
1
]
-
1
))
/
strides
[
1
])
+
1
);
output_shape
[
1
]
=
static_cast
<
index_t
>
(
std
::
ceil
(
1.0
*
(
input_shape
[
1
]
+
padding_size
[
0
]
-
filter_shape
[
0
]
-
(
filter_shape
[
0
]
-
1
)
*
(
dilations
[
0
]
-
1
))
/
strides
[
0
])
+
1
);
output_shape
[
2
]
=
static_cast
<
index_t
>
(
std
::
ceil
(
1.0
*
(
input_shape
[
2
]
+
padding_size
[
1
]
-
filter_shape
[
1
]
-
(
filter_shape
[
1
]
-
1
)
*
(
dilations
[
1
]
-
1
))
/
strides
[
1
])
+
1
);
}
output_shape
[
3
]
=
filter_shape
[
2
];
}
void
CalPaddingSize
(
const
index_t
*
input_shape
,
// NCHW
...
...
mace/kernels/conv_pool_2d_util.h
浏览文件 @
4410ecd2
...
...
@@ -15,7 +15,7 @@ enum Padding {
FULL
=
2
,
// Pads with one less than the filter size on both sides
};
enum
RoundType
{
enum
RoundType
{
FLOOR
=
0
,
CEIL
=
1
,
};
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
4410ecd2
...
...
@@ -10,9 +10,9 @@
#endif
#include "mace/core/future.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/public/mace.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
paddings_
(
paddings
),
dilations_
(
dilations
),
activation_
(
activation
),
relux_max_limit_
(
relux_max_limit
){}
relux_max_limit_
(
relux_max_limit
)
{}
const
int
*
strides_
;
// [stride_h, stride_w]
const
Padding
padding_type_
;
...
...
@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input
->
shape
().
data
(),
fake_filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
CalcOutputSize
(
input
->
shape
().
data
(),
fake_filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
}
auto
input_shape
=
fake_filter_shape
;
output
->
Resize
(
output_shape
);
...
...
mace/kernels/eltwise.h
浏览文件 @
4410ecd2
...
...
@@ -5,13 +5,13 @@
#define MACE_KERNELS_ELTWISE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace
mace
{
namespace
kernels
{
enum
EltwiseType
{
enum
EltwiseType
{
PROD
=
0
,
SUM
=
1
,
MAX
=
2
,
...
...
@@ -19,8 +19,7 @@ enum EltwiseType{
};
struct
EltwiseFunctorBase
{
EltwiseFunctorBase
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
EltwiseFunctorBase
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
:
type_
(
type
),
coeff_
(
coeff
)
{}
EltwiseType
type_
;
...
...
@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
template
<
DeviceType
D
,
typename
T
>
struct
EltwiseFunctor
:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
:
EltwiseFunctorBase
(
type
,
coeff
)
{}
void
operator
()(
const
Tensor
*
input0
,
...
...
@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
switch
(
type_
)
{
case
PROD
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
input0_ptr
[
i
]
*
input1_ptr
[
i
];
}
break
;
...
...
@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
}
else
{
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
coeff_
[
0
]
*
input0_ptr
[
i
]
+
coeff_
[
1
]
*
input1_ptr
[
i
];
output_ptr
[
i
]
=
coeff_
[
0
]
*
input0_ptr
[
i
]
+
coeff_
[
1
]
*
input1_ptr
[
i
];
}
}
break
;
case
MAX
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
max
<
T
>
(
input0_ptr
[
i
],
input1_ptr
[
i
]);
}
break
;
case
MIN
:
#pragma omp parallel for
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
size
;
++
i
)
{
output_ptr
[
i
]
=
std
::
min
<
T
>
(
input0_ptr
[
i
],
input1_ptr
[
i
]);
}
break
;
...
...
@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
}
};
template
<
typename
T
>
struct
EltwiseFunctor
<
DeviceType
::
OPENCL
,
T
>:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
struct
EltwiseFunctor
<
DeviceType
::
OPENCL
,
T
>
:
EltwiseFunctorBase
{
EltwiseFunctor
(
const
EltwiseType
type
,
const
std
::
vector
<
float
>
&
coeff
)
:
EltwiseFunctorBase
(
type
,
coeff
)
{}
void
operator
()(
const
Tensor
*
input0
,
...
...
mace/kernels/fully_connected.h
浏览文件 @
4410ecd2
...
...
@@ -6,8 +6,8 @@
#define MACE_KERNELS_FULLY_CONNECTED_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
namespace
mace
{
...
...
@@ -16,25 +16,23 @@ namespace kernels {
struct
FullyConnectedBase
{
FullyConnectedBase
(
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
activation_
(
activation
),
relux_max_limit_
(
relux_max_limit
){}
:
activation_
(
activation
),
relux_max_limit_
(
relux_max_limit
)
{}
const
ActivationType
activation_
;
const
float
relux_max_limit_
;
};
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
FullyConnectedFunctor
:
FullyConnectedBase
{
FullyConnectedFunctor
(
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
FullyConnectedBase
(
activation
,
relux_max_limit
)
{}
const
float
relux_max_limit
)
:
FullyConnectedBase
(
activation
,
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
weight
,
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
=
{
input
->
dim
(
0
),
1
,
1
,
weight
->
dim
(
0
)};
output
->
Resize
(
output_shape
);
const
index_t
N
=
output
->
dim
(
0
);
...
...
@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
}
};
template
<
typename
T
>
template
<
typename
T
>
struct
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
T
>
:
FullyConnectedBase
{
FullyConnectedFunctor
(
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
FullyConnectedBase
(
activation
,
relux_max_limit
)
{}
const
float
relux_max_limit
)
:
FullyConnectedBase
(
activation
,
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
weight
,
...
...
mace/kernels/global_avg_pooling.h
浏览文件 @
4410ecd2
...
...
@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {
template
<
>
void
GlobalAvgPoolingFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
float
*
input
,
const
index_t
*
input_shape
,
float
*
output
,
StatsFuture
*
future
);
const
float
*
input
,
const
index_t
*
input_shape
,
float
*
output
,
StatsFuture
*
future
);
}
// namespace kernels
}
// namespace mace
...
...
mace/kernels/matmul.h
浏览文件 @
4410ecd2
...
...
@@ -6,20 +6,18 @@
#define MACE_KERNELS_MATMUL_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace
mace
{
namespace
kernels
{
template
<
DeviceType
D
,
typename
T
>
struct
MatMulFunctor
{
void
operator
()(
const
Tensor
*
A
,
const
Tensor
*
B
,
Tensor
*
C
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
c_shape
=
{
A
->
dim
(
0
),
A
->
dim
(
1
),
B
->
dim
(
2
),
1
};
C
->
Resize
(
c_shape
);
const
index_t
N
=
C
->
dim
(
0
);
...
...
@@ -52,7 +50,6 @@ struct MatMulFunctor {
}
};
template
<
typename
T
>
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
T
>
{
void
operator
()(
const
Tensor
*
A
,
...
...
mace/kernels/neon/batch_norm_neon.cc
浏览文件 @
4410ecd2
...
...
@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
n
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
sample_size
;
++
j
)
{
const
float
*
input_sample_ptr
=
input_ptr
+
(
i
*
sample_size
+
j
)
*
channel
;
const
float
*
input_sample_ptr
=
input_ptr
+
(
i
*
sample_size
+
j
)
*
channel
;
float
*
output_sample_ptr
=
output_ptr
+
(
i
*
sample_size
+
j
)
*
channel
;
const
float
*
new_scale_ptr
=
new_scale
.
data
();
const
float
*
new_offset_ptr
=
new_offset
.
data
();
...
...
mace/kernels/neon/conv_2d_neon.cc
浏览文件 @
4410ecd2
...
...
@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL
(
filter
);
MACE_CHECK_NOTNULL
(
output
);
std
::
vector
<
index_t
>
output_shape_vec
(
4
);
std
::
vector
<
int
>
paddings
(
2
);
kernels
::
CalcPaddingAndOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
dilations_
,
strides_
,
paddings_
,
output_shape_vec
.
data
(),
paddings
.
data
());
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
dilations_
,
strides_
,
paddings_
,
output_shape_vec
.
data
(),
paddings
.
data
());
output
->
Resize
(
output_shape_vec
);
typedef
void
(
*
Conv2dNeonFunction
)(
...
...
@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
auto
output_shape
=
output
->
shape
().
data
();
auto
conv2d_neon_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
conv2d_neon_func
(
input_data
,
input_shape
,
filter_data
,
nullptr
,
bias_data
,
output_data
,
output_shape
);
conv2d_neon_func
(
input_data
,
input_shape
,
filter_data
,
nullptr
,
bias_data
,
output_data
,
output_shape
);
}
}
// namespace kernels
...
...
mace/kernels/neon/conv_2d_neon_3x3.cc
浏览文件 @
4410ecd2
...
...
@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
int
input_channels
=
input_shape
[
1
];
int
input_height
=
input_shape
[
2
];
int
input_width
=
input_shape
[
3
];
int
multiplier
=
filter_shape
==
nullptr
?
0
:
filter_shape
[
0
];
int
filter_in_channels
=
filter_shape
==
nullptr
?
input_channels
:
1
;
int
multiplier
=
filter_shape
==
nullptr
?
0
:
filter_shape
[
0
];
int
filter_in_channels
=
filter_shape
==
nullptr
?
input_channels
:
1
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
output_batch
;
++
b
)
{
for
(
int
oc
=
0
;
oc
<
output_channels
;
++
oc
)
{
...
...
@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
int
input_channels
=
input_shape
[
1
];
int
input_height
=
input_shape
[
2
];
int
input_width
=
input_shape
[
3
];
int
multiplier
=
filter_shape
==
nullptr
?
0
:
filter_shape
[
0
];
int
filter_in_channels
=
filter_shape
==
nullptr
?
input_channels
:
1
;
int
multiplier
=
filter_shape
==
nullptr
?
0
:
filter_shape
[
0
];
int
filter_in_channels
=
filter_shape
==
nullptr
?
input_channels
:
1
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
output_batch
;
++
b
)
{
...
...
mace/kernels/neon/depthwise_conv_neon.cc
浏览文件 @
4410ecd2
...
...
@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<<
"filter"
<<
kernel_h
<<
"x"
<<
kernel_w
<<
","
<<
" stride "
<<
strides_
[
0
]
<<
"x"
<<
strides_
[
1
]
<<
" is not implemented yet, using slow version"
;
DepthwiseConv2dFunctor
<
DeviceType
::
CPU
,
float
>
(
strides_
,
paddings_
,
dilations_
)(
input
,
filter
,
bias
,
output
,
future
);
DepthwiseConv2dFunctor
<
DeviceType
::
CPU
,
float
>
(
strides_
,
paddings_
,
dilations_
)(
input
,
filter
,
bias
,
output
,
future
);
return
;
}
...
...
@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
input_shape
=
padded_input
.
shape
().
data
();
}
auto
conv2d_neon_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
conv2d_neon_func
(
input_ptr
,
input_shape
,
filter_ptr
,
filter_shape
,
bias_ptr
,
output_ptr
,
output_shape
);
conv2d_neon_func
(
input_ptr
,
input_shape
,
filter_ptr
,
filter_shape
,
bias_ptr
,
output_
ptr
,
output_
shape
);
}
}
// namespace kernels
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
int
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
if
(
activation_
==
PRELU
)
{
...
...
@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
}
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t
idx
=
0
;
for
(
auto
input
:
input_tensors
)
{
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
}
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)
};
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
};
template
struct
AddNFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
AddNFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
AddNFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
AddNFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
kernel_
=
runtime
->
BuildKernel
(
"batch_norm"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"batch_norm"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
scale
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
offset
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
offset
->
opencl_image
()));
if
(
!
folded_constant_
)
{
kernel_
.
setArg
(
idx
++
,
*
(
mean
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
mean
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
var
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
epsilon
);
}
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -12,8 +12,7 @@ namespace mace {
namespace
kernels
{
template
<
typename
T
>
void
BiasAddFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
void
BiasAddFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
...
...
@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
}
}
template
struct
BiasAddFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
BiasAddFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
BiasAddFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
BiasAddFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
4410ecd2
...
...
@@ -9,36 +9,33 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
void
BufferToImageFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
Tensor
*
buffer
,
const
BufferType
type
,
Tensor
*
image
,
StatsFuture
*
future
)
{
template
<
typename
T
>
void
BufferToImageFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
Tensor
*
buffer
,
const
BufferType
type
,
Tensor
*
image
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
if
(
!
i2b_
)
{
CalImage2DShape
(
buffer
->
shape
(),
type
,
image_shape
);
if
(
type
==
WINOGRAD_FILTER
)
{
std
::
vector
<
index_t
>
new_shape
=
CalWinogradShape
(
buffer
->
shape
(),
type
);
if
(
type
==
WINOGRAD_FILTER
)
{
std
::
vector
<
index_t
>
new_shape
=
CalWinogradShape
(
buffer
->
shape
(),
type
);
image
->
ResizeImage
(
new_shape
,
image_shape
);
}
else
{
image
->
ResizeImage
(
buffer
->
shape
(),
image_shape
);
}
}
else
{
Image
*
image_buf
=
dynamic_cast
<
Image
*>
(
image
->
UnderlyingBuffer
());
Image
*
image_buf
=
dynamic_cast
<
Image
*>
(
image
->
UnderlyingBuffer
());
image_shape
=
image_buf
->
image_shape
();
buffer
->
Resize
(
image
->
shape
());
}
size_t
gws
[
2
]
=
{
image_shape
[
0
],
image_shape
[
1
]};
size_t
gws
[
2
]
=
{
image_shape
[
0
],
image_shape
[
1
]};
std
::
string
kernel_name
;
switch
(
type
)
{
case
CONV2D_FILTER
:
kernel_name
=
i2b_
?
"filter_image_to_buffer"
:
"filter_buffer_to_image"
;
break
;
case
DW_CONV2D_FILTER
:
kernel_name
=
i2b_
?
"dw_filter_image_to_buffer"
:
"dw_filter_buffer_to_image"
;
kernel_name
=
i2b_
?
"dw_filter_image_to_buffer"
:
"dw_filter_buffer_to_image"
;
break
;
case
IN_OUT_CHANNEL
:
kernel_name
=
i2b_
?
"in_out_image_to_buffer"
:
"in_out_buffer_to_image"
;
...
...
@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break
;
case
IN_OUT_HEIGHT
:
case
WEIGHT_HEIGHT
:
kernel_name
=
i2b_
?
"in_out_height_image_to_buffer"
:
"in_out_height_buffer_to_image"
;
kernel_name
=
i2b_
?
"in_out_height_image_to_buffer"
:
"in_out_height_buffer_to_image"
;
break
;
case
IN_OUT_WIDTH
:
MACE_CHECK
(
!
i2b_
)
<<
"IN_OUT_WIDTH only support buffer to image now"
;
...
...
@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
break
;
case
WINOGRAD_FILTER
:
gws
[
1
]
/=
16
;
kernel_name
=
i2b_
?
"winograd_filter_image_to_buffer"
:
"winograd_filter_buffer_to_image"
;
kernel_name
=
i2b_
?
"winograd_filter_image_to_buffer"
:
"winograd_filter_buffer_to_image"
;
break
;
}
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
...
...
@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options
.
emplace
(
kernel_name_ss
.
str
());
if
(
buffer
->
dtype
()
==
image
->
dtype
())
{
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
}
else
{
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
}
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
b2f_kernel
=
runtime
->
BuildKernel
(
"buffer_to_image"
,
obfuscated_kernel_name
,
built_options
);
obfuscated_kernel_name
,
built_options
);
uint32_t
idx
=
0
;
b2f_kernel
.
setArg
(
idx
++
,
*
(
buffer
->
opencl_buffer
()));
if
(
!
i2b_
)
{
MACE_CHECK
(
buffer
->
buffer_offset
()
%
GetEnumTypeSize
(
buffer
->
dtype
())
==
0
,
"buffer offset not aligned"
);
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
buffer_offset
()
/
GetEnumTypeSize
(
buffer
->
dtype
())));
MACE_CHECK
(
buffer
->
buffer_offset
()
%
GetEnumTypeSize
(
buffer
->
dtype
())
==
0
,
"buffer offset not aligned"
);
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
buffer_offset
()
/
GetEnumTypeSize
(
buffer
->
dtype
())));
}
if
(
type
==
ARGUMENT
)
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
0
)));
}
else
if
(
type
==
WEIGHT_HEIGHT
)
{
}
else
if
(
type
==
WEIGHT_HEIGHT
)
{
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
0
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
1
)));
b2f_kernel
.
setArg
(
idx
++
,
1
);
...
...
@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
};
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
...
...
mace/kernels/opencl/cl/common.h
浏览文件 @
4410ecd2
...
...
@@ -18,8 +18,8 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
__constant
sampler_t
SAMPLER
=
CLK_NORMALIZED_COORDS_FALSE
|
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
__constant
sampler_t
SAMPLER
=
CLK_NORMALIZED_COORDS_FALSE
|
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
inline
DATA_TYPE4
do_activation
(
DATA_TYPE4
in
,
#ifdef USE_PRELU
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#include "mace/kernels/concat.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input0
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input1
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input0
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input1
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input0
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
(),
gws
,
lws
,
future
);
}
...
...
@@ -97,22 +96,20 @@ static void ConcatN(cl::Kernel *kernel,
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
chan_blk_offset
+=
input_channel_blk
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
<<
batch
*
height
;
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
(),
gws
,
lws
,
future
);
}
}
template
<
typename
T
>
void
ConcatFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
template
<
typename
T
>
void
ConcatFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
int
inputs_count
=
input_list
.
size
();
...
...
@@ -137,7 +134,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
}
output_shape
[
axis_
]
+=
input
->
dim
(
axis_
);
}
MACE_CHECK
(
inputs_count
==
2
||
divisible_four
,
MACE_CHECK
(
inputs_count
==
2
||
divisible_four
,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."
);
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
image_shape
);
...
...
@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
default:
if
(
divisible_four
)
{
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
else
{
}
else
{
MACE_NOT_IMPLEMENTED
;
}
}
};
template
struct
ConcatFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
ConcatFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
ConcatFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
ConcatFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Tensor
*
output
,
StatsFuture
*
future
);
template
<
typename
T
>
template
<
typename
T
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
typedef
void
(
*
Conv2dOpenclFunction
)(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
index_t
kernel_h
=
filter
->
dim
(
0
);
index_t
kernel_w
=
filter
->
dim
(
1
);
...
...
@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
CalcOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
}
std
::
vector
<
size_t
>
output_image_shape
;
...
...
@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
kernel_h
==
kernel_w
&&
kernel_h
<=
5
&&
selector
[
kernel_h
-
1
]
!=
nullptr
)
{
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
}
template
struct
Conv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
Conv2dFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
Conv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
Conv2dFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
4410ecd2
...
...
@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
}
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
// FIXME handle flexable data type: half not supported
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
input_height
));
...
...
@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
}
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
4410ecd2
...
...
@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
}
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
1
)));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
2
)));
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
4410ecd2
...
...
@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
}
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
1
)));
kernel
->
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)));
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
const
index_t
input_batch
=
input
->
dim
(
0
);
const
index_t
input_height
=
input
->
dim
(
1
);
const
index_t
input_width
=
input
->
dim
(
2
);
...
...
@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
*
kernel
=
runtime
->
BuildKernel
(
"depthwise_conv2d"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"depthwise_conv2d"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
}
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
static_cast
<
short
>
(
input_height
));
kernel
->
setArg
(
idx
++
,
static_cast
<
short
>
(
input_width
));
...
...
@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input
->
shape
().
data
(),
fake_filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
CalcOutputSize
(
input
->
shape
().
data
(),
fake_filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
FLOOR
,
output_shape
.
data
());
}
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const
Tensor
*
input1
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
index_t
batch
=
input0
->
dim
(
0
);
const
index_t
height
=
input0
->
dim
(
1
);
const
index_t
width
=
input0
->
dim
(
2
);
...
...
@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
=
runtime
->
BuildKernel
(
"eltwise"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
if
(
!
coeff_
.
empty
())
{
kernel_
.
setArg
(
idx
++
,
coeff_
[
0
]);
kernel_
.
setArg
(
idx
++
,
coeff_
[
1
]);
...
...
@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)
};
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -10,14 +10,13 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
void
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
weight
,
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
=
{
input
->
dim
(
0
),
1
,
1
,
weight
->
dim
(
0
)};
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
...
...
@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
kernel_
=
runtime
->
BuildKernel
(
"fully_connected"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"fully_connected"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)));
...
...
@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
,
1
};
std
::
stringstream
ss
;
ss
<<
"fc_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
ss
<<
"fc_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
};
template
struct
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/helper.cc
浏览文件 @
4410ecd2
...
...
@@ -3,8 +3,8 @@
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -28,7 +28,8 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
}
// [H * W * M, (Ic + 3) / 4]
void
CalDepthwiseConv2dFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* HWIM */
void
CalDepthwiseConv2dFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* HWIM */
std
::
vector
<
size_t
>
&
image_shape
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
image_shape
.
resize
(
2
);
...
...
@@ -47,7 +48,8 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void
CalWinogradFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* Oc, Ic, H, W*/
void
CalWinogradFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* Oc, Ic, H, W*/
std
::
vector
<
size_t
>
&
image_shape
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
image_shape
.
resize
(
2
);
...
...
@@ -115,15 +117,12 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
}
std
::
vector
<
index_t
>
CalWinogradShape
(
const
std
::
vector
<
index_t
>
&
shape
,
const
BufferType
type
)
{
if
(
type
==
WINOGRAD_FILTER
)
{
return
{
16
,
shape
[
0
],
shape
[
1
],
1
};
}
else
if
(
type
==
IN_OUT_HEIGHT
)
{
index_t
out_width
=
shape
[
0
]
*
((
shape
[
1
]
-
1
)
/
2
)
*
((
shape
[
2
]
-
1
)
/
2
);
}
else
if
(
type
==
IN_OUT_HEIGHT
)
{
index_t
out_width
=
shape
[
0
]
*
((
shape
[
1
]
-
1
)
/
2
)
*
((
shape
[
2
]
-
1
)
/
2
);
return
{
16
,
shape
[
3
],
out_width
,
1
};
}
else
{
LOG
(
FATAL
)
<<
"Mace not supported yet."
;
...
...
@@ -188,8 +187,8 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
// TODO tuning these magic numbers
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
...
...
@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
};
};
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
std
::
vector
<
uint32_t
>
*
tuning_result
)
->
cl_int
{
MACE_CHECK
(
params
.
size
()
==
4
)
<<
"Tuning parameters of 3D kernel must be 4D"
;
MACE_CHECK
(
params
.
size
()
==
4
)
<<
"Tuning parameters of 3D kernel must be 4D"
;
cl_int
error
=
CL_SUCCESS
;
if
(
timer
==
nullptr
)
{
uint32_t
num_blocks
=
params
[
3
];
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
...
...
@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
if
(
LimitKernelTime
())
{
double
elapse_time
=
timer
->
AccumulatedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
(
*
tuning_result
)[
3
]
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
...
...
@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
{
kwg_size
/
256
,
256
,
1
},
{
kwg_size
/
512
,
512
,
1
},
{
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
}
};
{
1
,
kwg_size
,
1
}};
};
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
std
::
vector
<
uint32_t
>
*
tuning_result
)
->
cl_int
{
MACE_CHECK
(
params
.
size
()
==
3
)
<<
"Tuning parameters of 2D kernel must be 3d"
;
MACE_CHECK
(
params
.
size
()
==
3
)
<<
"Tuning parameters of 2D kernel must be 3d"
;
cl_int
error
=
CL_SUCCESS
;
if
(
timer
==
nullptr
)
{
uint32_t
num_blocks
=
params
[
2
];
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
if
(
gws
[
1
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
ClearTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
...
...
@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
if
(
LimitKernelTime
())
{
double
elapse_time
=
timer
->
AccumulatedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
1
]);
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
1
]);
(
*
tuning_result
)[
2
]
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
if
(
gws
[
1
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
...
...
@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
return
error
;
};
OpenCLProfilingTimer
timer
(
&
event
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
tuning_key
,
lws
,
params_generator
,
func
,
&
timer
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
tuning_key
,
lws
,
params_generator
,
func
,
&
timer
);
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
...
...
@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
}
};
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/helper.h
浏览文件 @
4410ecd2
...
...
@@ -5,11 +5,11 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
#include "mace/utils/utils.h"
#include "mace/core/future.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
);
void
TuningOrRun2DKernel
(
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
...
...
@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
}
namespace
{
template
<
typename
T
>
template
<
typename
T
>
void
AppendToStream
(
std
::
stringstream
*
ss
,
const
std
::
string
&
delimiter
,
T
v
)
{
(
*
ss
)
<<
v
;
}
template
<
typename
T
,
typename
...
Args
>
template
<
typename
T
,
typename
...
Args
>
void
AppendToStream
(
std
::
stringstream
*
ss
,
const
std
::
string
&
delimiter
,
T
first
,
...
...
@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
}
}
// namespace
template
<
typename
...
Args
>
template
<
typename
...
Args
>
std
::
string
Concat
(
Args
...
args
)
{
std
::
stringstream
ss
;
AppendToStream
(
&
ss
,
"_"
,
args
...);
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
4410ecd2
...
...
@@ -11,12 +11,10 @@ namespace mace {
namespace
kernels
{
template
<
typename
T
>
void
MatMulFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
A
,
void
MatMulFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
A
,
const
Tensor
*
B
,
Tensor
*
C
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
c_shape
=
{
A
->
dim
(
0
),
A
->
dim
(
1
),
B
->
dim
(
2
),
1
};
std
::
vector
<
size_t
>
c_image_shape
;
CalImage2DShape
(
c_shape
,
BufferType
::
IN_OUT_HEIGHT
,
c_image_shape
);
...
...
@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
A
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
B
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
B
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
C
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
...
...
@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
};
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
,
1
};
std
::
stringstream
ss
;
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
};
template
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -11,17 +11,15 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
void
PoolingFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_CHECK
(
dilations_
[
0
]
==
1
&&
dilations_
[
1
]
==
1
)
<<
"Pooling opencl kernel not support dilation yet"
;
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
input
->
dim
(
3
),
input
->
dim
(
3
)
};
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
input
->
dim
(
3
),
input
->
dim
(
3
)};
std
::
vector
<
int
>
paddings
(
2
);
if
(
paddings_
.
empty
())
{
...
...
@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -2,12 +2,12 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/resize_bilinear.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std
::
vector
<
index_t
>
output_shape
{
batch
,
out_height
,
out_width
,
channels
};
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
float
height_scale
=
CalculateResizeScale
(
in_height
,
out_height
,
align_corners_
);
float
width_scale
=
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
float
width_scale
=
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
...
...
@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
kernel_
=
runtime
->
BuildKernel
(
"resize_bilinear"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"resize_bilinear"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -6,13 +6,13 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
void
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
template
struct
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
4410ecd2
...
...
@@ -5,17 +5,18 @@
#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
void
SpaceToBatchFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
Tensor
*
space_tensor
,
void
SpaceToBatchFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
Tensor
*
space_tensor
,
const
std
::
vector
<
index_t
>
&
output_shape
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
)
{
...
...
@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
built_options
.
emplace
(
kernel_name_ss
.
str
());
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
kernel_
=
runtime
->
BuildKernel
(
"space_to_batch"
,
kernel_name
,
built_options
);
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
kernel_
=
runtime
->
BuildKernel
(
"space_to_batch"
,
kernel_name
,
built_options
);
uint32_t
idx
=
0
;
if
(
b2s_
)
{
...
...
@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
<<
batch_tensor
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
4410ecd2
...
...
@@ -11,21 +11,21 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
template
<
typename
T
>
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
input_tensor
->
dim
(
3
),
1
};
std
::
vector
<
int
>
paddings
(
2
);
if
(
paddings_
.
empty
())
{
kernels
::
CalcNHWCPaddingAndOutputSize
(
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
RoundType
::
FLOOR
,
output_shape
.
data
());
CalcOutputSize
(
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
.
data
(),
strides_
.
data
(),
RoundType
::
FLOOR
,
output_shape
.
data
());
}
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
...
...
@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_HEIGHT
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-Dwinograd_transform_2x2="
+
obfuscated_kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
uint32_t
idx
=
0
;
...
...
@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
1
]
/
2
));
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
const
std
::
vector
<
uint32_t
>
lws
=
{
128
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
template
<
typename
T
>
void
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
template
<
typename
T
>
void
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
const
Tensor
*
bias
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
input_tensor
->
dim
(
1
)};
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
input_tensor
->
dim
(
1
)};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-Dwinograd_inverse_transform_2x2="
+
obfuscated_kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-Dwinograd_inverse_transform_2x2="
+
obfuscated_kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
switch
(
activation_
)
{
case
NOOP
:
...
...
@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
}
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input_tensor
->
opencl_image
())));
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input_tensor
->
opencl_image
())));
if
(
bias
!=
nullptr
)
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
bias
->
opencl_image
())));
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
bias
->
opencl_image
())));
}
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output_tensor
->
opencl_image
())));
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output_tensor
->
opencl_image
())));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
output_shape
[
1
]));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
output_shape
[
2
]));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_h
*
round_w
));
...
...
@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)),
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
1
)))};
const
std
::
vector
<
uint32_t
>
lws
=
{
128
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
template
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
half
>;
template
struct
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/pooling.h
浏览文件 @
4410ecd2
...
...
@@ -7,9 +7,9 @@
#include <limits>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace
mace
{
...
...
@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
const
int
*
dilations_
;
};
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
PoolingFunctor
:
PoolingFunctorBase
{
PoolingFunctor
(
const
PoolingType
pooling_type
,
const
int
*
kernels
,
...
...
@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
const
Padding
padding_type
,
const
std
::
vector
<
int
>
&
paddings
,
const
int
*
dilations
)
:
PoolingFunctorBase
(
pooling_type
,
kernels
,
strides
,
padding_type
,
paddings
,
dilations
)
{
}
:
PoolingFunctorBase
(
pooling_type
,
kernels
,
strides
,
padding_type
,
paddings
,
dilations
)
{
}
void
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
input_tensor
->
dim
(
3
),
input_tensor
->
dim
(
3
)
};
kernels_
[
0
],
kernels_
[
1
],
input_tensor
->
dim
(
3
),
input_tensor
->
dim
(
3
)};
std
::
vector
<
int
>
paddings
(
2
);
if
(
paddings_
.
empty
())
{
kernels
::
CalcNHWCPaddingAndOutputSize
(
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
dilations_
,
strides_
,
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
dilations_
,
strides_
,
padding_type_
,
output_shape
.
data
(),
paddings
.
data
());
}
else
{
paddings
=
paddings_
;
CalcOutputSize
(
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
CEIL
,
output_shape
.
data
());
CalcOutputSize
(
input_tensor
->
shape
().
data
(),
filter_shape
.
data
(),
paddings_
.
data
(),
dilations_
,
strides_
,
RoundType
::
CEIL
,
output_shape
.
data
());
}
output_tensor
->
Resize
(
output_shape
);
...
...
@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
index_t
out_offset
=
(((
b
*
height
)
+
h
)
*
width
+
w
)
*
channels
+
c
;
index_t
out_offset
=
(((
b
*
height
)
+
h
)
*
width
+
w
)
*
channels
+
c
;
index_t
in_offset
=
b
*
in_image_size
*
input_channels
+
c
;
T
res
=
std
::
numeric_limits
<
T
>::
lowest
();
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
...
...
@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
if
(
inh
>=
0
&&
inh
<
input_height
&&
inw
>=
0
&&
inw
<
input_width
)
{
index_t
input_offset
=
in_offset
+
(
inh
*
input_width
+
inw
)
*
input_channels
;
index_t
input_offset
=
in_offset
+
(
inh
*
input_width
+
inw
)
*
input_channels
;
res
=
std
::
max
(
res
,
input
[
input_offset
]);
}
}
...
...
@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
index_t
out_offset
=
(((
b
*
height
)
+
h
)
*
width
+
w
)
*
channels
+
c
;
index_t
out_offset
=
(((
b
*
height
)
+
h
)
*
width
+
w
)
*
channels
+
c
;
index_t
in_offset
=
b
*
in_image_size
*
input_channels
+
c
;
T
sum
=
0
;
int
block_size
=
0
;
...
...
@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
if
(
inh
>=
0
&&
inh
<
input_height
&&
inw
>=
0
&&
inw
<
input_width
)
{
index_t
input_offset
=
in_offset
+
(
inh
*
input_width
+
inw
)
*
input_channels
;
index_t
input_offset
=
in_offset
+
(
inh
*
input_width
+
inw
)
*
input_channels
;
sum
+=
input
[
input_offset
];
block_size
+=
1
;
}
...
...
@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
};
template
<
>
template
<
>
void
PoolingFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
);
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
);
template
<
typename
T
>
template
<
typename
T
>
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
T
>
:
PoolingFunctorBase
{
PoolingFunctor
(
const
PoolingType
pooling_type
,
const
int
*
kernels
,
...
...
@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
const
Padding
padding_type
,
const
std
::
vector
<
int
>
&
paddings
,
const
int
*
dilations
)
:
PoolingFunctorBase
(
pooling_type
,
kernels
,
strides
,
padding_type
,
paddings
,
dilations
)
{
}
:
PoolingFunctorBase
(
pooling_type
,
kernels
,
strides
,
padding_type
,
paddings
,
dilations
)
{
}
void
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
);
...
...
mace/kernels/reshape.h
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESHAPE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -25,7 +25,6 @@ struct ReshapeFunctor {
}
};
}
// namespace kernels
}
// namespace mace
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
4410ecd2
...
...
@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
}
};
template
<
typename
T
>
struct
ResizeBilinearFunctor
<
DeviceType
::
OPENCL
,
T
>
:
ResizeBilinearFunctorBase
{
template
<
typename
T
>
struct
ResizeBilinearFunctor
<
DeviceType
::
OPENCL
,
T
>
:
ResizeBilinearFunctorBase
{
ResizeBilinearFunctor
(
const
std
::
vector
<
index_t
>
&
size
,
bool
align_corners
)
:
ResizeBilinearFunctorBase
(
size
,
align_corners
)
{}
...
...
mace/kernels/space_to_batch.h
浏览文件 @
4410ecd2
...
...
@@ -6,9 +6,9 @@
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/runtime/opencl/cl2_header.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -16,11 +16,10 @@ namespace kernels {
struct
SpaceToBatchFunctorBase
{
SpaceToBatchFunctorBase
(
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
,
bool
b2s
)
:
paddings_
(
paddings
.
begin
(),
paddings
.
end
()),
bool
b2s
)
:
paddings_
(
paddings
.
begin
(),
paddings
.
end
()),
block_shape_
(
block_shape
.
begin
(),
block_shape
.
end
()),
b2s_
(
b2s
)
{}
b2s_
(
b2s
)
{}
std
::
vector
<
int
>
paddings_
;
std
::
vector
<
int
>
block_shape_
;
...
...
@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
};
template
<
DeviceType
D
,
typename
T
>
struct
SpaceToBatchFunctor
:
SpaceToBatchFunctorBase
{
struct
SpaceToBatchFunctor
:
SpaceToBatchFunctorBase
{
SpaceToBatchFunctor
(
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
,
bool
b2s
)
:
SpaceToBatchFunctorBase
(
paddings
,
block_shape
,
b2s
){}
bool
b2s
)
:
SpaceToBatchFunctorBase
(
paddings
,
block_shape
,
b2s
)
{}
void
operator
()(
Tensor
*
space_tensor
,
const
std
::
vector
<
index_t
>
&
output_shape
,
...
...
@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
};
template
<
typename
T
>
struct
SpaceToBatchFunctor
<
DeviceType
::
OPENCL
,
T
>
:
SpaceToBatchFunctorBase
{
struct
SpaceToBatchFunctor
<
DeviceType
::
OPENCL
,
T
>
:
SpaceToBatchFunctorBase
{
SpaceToBatchFunctor
(
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
,
bool
b2s
)
:
SpaceToBatchFunctorBase
(
paddings
,
block_shape
,
b2s
){}
bool
b2s
)
:
SpaceToBatchFunctorBase
(
paddings
,
block_shape
,
b2s
)
{}
void
operator
()(
Tensor
*
space_tensor
,
const
std
::
vector
<
index_t
>
&
output_shape
,
...
...
@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
};
}
// namespace kernels
...
...
mace/kernels/winograd_transform.h
浏览文件 @
4410ecd2
...
...
@@ -6,10 +6,10 @@
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/
core/runtime/opencl/cl2_header
.h"
#include "mace/
kernels/conv_pool_2d_util
.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -17,8 +17,10 @@ namespace kernels {
struct
WinogradTransformFunctorBase
{
WinogradTransformFunctorBase
(
const
Padding
&
padding_type
,
const
std
::
vector
<
int
>
&
paddings
)
:
strides_
({
1
,
1
}),
dilations_
({
1
,
1
}),
padding_type_
(
padding_type
),
paddings_
(
paddings
)
{}
:
strides_
({
1
,
1
}),
dilations_
({
1
,
1
}),
padding_type_
(
padding_type
),
paddings_
(
paddings
)
{}
const
std
::
vector
<
int
>
strides_
;
// [stride_h, stride_w]
const
std
::
vector
<
int
>
dilations_
;
// [dilation_h, dilation_w]
...
...
@@ -26,29 +28,25 @@ struct WinogradTransformFunctorBase {
std
::
vector
<
int
>
paddings_
;
};
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
WinogradTransformFunctor
:
WinogradTransformFunctorBase
{
WinogradTransformFunctor
(
const
Padding
&
padding_type
,
const
std
::
vector
<
int
>
&
paddings
)
:
WinogradTransformFunctorBase
(
padding_type
,
paddings
)
{}
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_NOT_IMPLEMENTED
;
}
};
template
<
typename
T
>
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>
:
WinogradTransformFunctorBase
{
template
<
typename
T
>
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>
:
WinogradTransformFunctorBase
{
WinogradTransformFunctor
(
const
Padding
&
padding_type
,
const
std
::
vector
<
int
>
&
paddings
)
:
WinogradTransformFunctorBase
(
padding_type
,
paddings
)
{}
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
};
...
...
@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
const
float
relux_max_limit_
;
};
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
WinogradInverseTransformFunctor
:
WinogradInverseTransformFunctorBase
{
WinogradInverseTransformFunctor
(
const
int
batch
,
const
int
height
,
const
int
width
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
WinogradInverseTransformFunctorBase
(
batch
,
height
,
width
,
activation
,
relux_max_limit
)
{}
:
WinogradInverseTransformFunctorBase
(
batch
,
height
,
width
,
activation
,
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
bias
,
...
...
@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
StatsFuture
*
future
)
{
MACE_NOT_IMPLEMENTED
;
}
};
template
<
typename
T
>
struct
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
T
>
:
WinogradInverseTransformFunctorBase
{
template
<
typename
T
>
struct
WinogradInverseTransformFunctor
<
DeviceType
::
OPENCL
,
T
>
:
WinogradInverseTransformFunctorBase
{
WinogradInverseTransformFunctor
(
const
int
batch
,
const
int
height
,
const
int
width
,
const
ActivationType
activation
,
const
float
relux_max_limit
)
:
WinogradInverseTransformFunctorBase
(
batch
,
height
,
width
,
activation
,
relux_max_limit
)
{}
:
WinogradInverseTransformFunctorBase
(
batch
,
height
,
width
,
activation
,
relux_max_limit
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
bias
,
...
...
mace/ops/activation.h
浏览文件 @
4410ecd2
...
...
@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
0
);
const
Tensor
*
alpha_tensor
=
this
->
InputSize
()
>=
2
?
this
->
Input
(
1
)
:
nullptr
;
const
Tensor
*
alpha_tensor
=
this
->
InputSize
()
>=
2
?
this
->
Input
(
1
)
:
nullptr
;
Tensor
*
output_tensor
=
this
->
outputs_
[
0
];
output_tensor
->
ResizeLike
(
input_tensor
);
...
...
mace/ops/activation_test.cc
浏览文件 @
4410ecd2
...
...
@@ -214,9 +214,7 @@ void TestSimplePrelu() {
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
2
,
2
,
2
,
2
},
{
-
7
,
7
,
-
6
,
6
,
-
5
,
-
5
,
-
4
,
-
4
,
-
3
,
3
,
-
2
,
2
,
-
1
,
-
1
,
0
,
0
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Alpha"
,
{
2
},
{
2.0
,
3.0
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Alpha"
,
{
2
},
{
2.0
,
3.0
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
,
float
>
(
net
,
"Input"
,
"InputImage"
,
...
...
@@ -250,7 +248,8 @@ void TestSimplePrelu() {
}
auto
expected
=
CreateTensor
<
float
>
(
{
2
,
2
,
2
,
2
},
{
-
14
,
7
,
-
12
,
6
,
-
10
,
-
15
,
-
8
,
-
12
,
-
6
,
3
,
-
4
,
2
,
-
2
,
-
3
,
0
,
0
});
{
2
,
2
,
2
,
2
},
{
-
14
,
7
,
-
12
,
6
,
-
10
,
-
15
,
-
8
,
-
12
,
-
6
,
3
,
-
4
,
2
,
-
2
,
-
3
,
0
,
0
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
...
...
mace/ops/addn.h
浏览文件 @
4410ecd2
...
...
@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
for
(
int
i
=
1
;
i
<
n
;
++
i
)
{
inputs
[
i
]
=
this
->
Input
(
i
);
MACE_CHECK
(
inputs
[
0
]
->
dim_size
()
==
inputs
[
i
]
->
dim_size
());
MACE_CHECK
(
inputs
[
0
]
->
size
()
==
inputs
[
i
]
->
size
())
<<
"Input 0: "
<<
MakeString
(
inputs
[
0
]
->
shape
())
<<
", size: "
<<
inputs
[
0
]
->
size
()
<<
". Input "
<<
i
<<
": "
<<
MakeString
(
inputs
[
i
]
->
shape
())
<<
", size: "
<<
inputs
[
i
]
->
size
();
MACE_CHECK
(
inputs
[
0
]
->
size
()
==
inputs
[
i
]
->
size
())
<<
"Input 0: "
<<
MakeString
(
inputs
[
0
]
->
shape
())
<<
", size: "
<<
inputs
[
0
]
->
size
()
<<
". Input "
<<
i
<<
": "
<<
MakeString
(
inputs
[
i
]
->
shape
())
<<
", size: "
<<
inputs
[
i
]
->
size
();
}
functor_
(
inputs
,
output_tensor
,
future
);
...
...
mace/ops/addn_benchmark.cc
浏览文件 @
4410ecd2
...
...
@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
OpsTestNet
net
;
// Add input data
for
(
int
i
=
0
;
i
<
inputs
;
++
i
)
{
net
.
AddRandomInput
<
D
,
float
>
(
MakeString
(
"Input"
,
i
).
c_str
(),
{
n
,
h
,
w
,
c
});
net
.
AddRandomInput
<
D
,
float
>
(
MakeString
(
"Input"
,
i
).
c_str
(),
{
n
,
h
,
w
,
c
});
}
if
(
D
==
DeviceType
::
OPENCL
)
{
...
...
mace/ops/batch_norm_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/batch_to_space.h
浏览文件 @
4410ecd2
...
...
@@ -12,13 +12,12 @@
namespace
mace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
class
BatchToSpaceNDOp
:
public
Operator
<
D
,
T
>
{
public:
BatchToSpaceNDOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"crops"
,
{
0
,
0
,
0
,
0
}),
functor_
(
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"crops"
,
{
0
,
0
,
0
,
0
}),
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"block_shape"
,
{
1
,
1
}),
true
)
{}
...
...
@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
CalculateOutputShape
(
batch_tensor
,
space_tensor
,
output_shape
.
data
());
functor_
(
space_tensor
,
output_shape
,
const_cast
<
Tensor
*>
(
batch_tensor
),
future
);
functor_
(
space_tensor
,
output_shape
,
const_cast
<
Tensor
*>
(
batch_tensor
),
future
);
return
true
;
}
...
...
@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
Tensor
*
output
,
index_t
*
output_shape
)
{
auto
crops
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"crops"
,
{
0
,
0
,
0
,
0
});
auto
block_shape
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"block_shape"
,
{
1
,
1
});
auto
block_shape
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"block_shape"
,
{
1
,
1
});
MACE_CHECK
(
input_tensor
->
dim_size
()
==
4
,
"Input's shape should be 4D"
);
MACE_CHECK
(
block_shape
.
size
()
==
2
,
"Block's shape should be 1D"
);
MACE_CHECK
(
crops
.
size
()
==
4
,
"Crops' shape should be 2D"
);
...
...
@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
const
index_t
block_dims
=
block_shape
.
size
();
index_t
block_shape_product
=
1
;
for
(
uint32_t
block_dim
=
0
;
block_dim
<
block_dims
;
++
block_dim
)
{
MACE_CHECK
(
block_shape
[
block_dim
]
>
1
,
"block_shape's value should be great to 1"
);
MACE_CHECK
(
block_shape
[
block_dim
]
>
1
,
"block_shape's value should be great to 1"
);
const
index_t
block_shape_value
=
block_shape
[
block_dim
];
const
index_t
cropped_input_size
=
input_tensor
->
dim
(
block_dim
+
1
)
*
block_shape_value
-
crops
[
block_dim
*
2
]
-
crops
[
block_dim
*
2
+
1
];
MACE_CHECK
(
cropped_input_size
>=
0
,
"cropped size must be non-negative"
);
const
index_t
cropped_input_size
=
input_tensor
->
dim
(
block_dim
+
1
)
*
block_shape_value
-
crops
[
block_dim
*
2
]
-
crops
[
block_dim
*
2
+
1
];
MACE_CHECK
(
cropped_input_size
>=
0
,
"cropped size must be non-negative"
);
block_shape_product
*=
block_shape_value
;
output_shape
[
block_dim
+
1
]
=
cropped_input_size
;
}
...
...
mace/ops/batch_to_space_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/bias_add_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/buffer_to_image.h
浏览文件 @
4410ecd2
...
...
@@ -11,7 +11,7 @@
namespace
mace
{
template
<
DeviceType
D
,
typename
T
>
class
BufferToImageOp
:
public
Operator
<
D
,
T
>
{
class
BufferToImageOp
:
public
Operator
<
D
,
T
>
{
public:
BufferToImageOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
)
{}
...
...
@@ -19,7 +19,8 @@ class BufferToImageOp: public Operator<D, T> {
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
INPUT
);
kernels
::
BufferType
type
=
static_cast
<
kernels
::
BufferType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
kernels
::
BufferType
type
=
static_cast
<
kernels
::
BufferType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"buffer_type"
,
static_cast
<
int
>
(
kernels
::
CONV2D_FILTER
)));
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
...
...
mace/ops/buffer_to_image_test.cc
浏览文件 @
4410ecd2
...
...
@@ -7,8 +7,9 @@
using
namespace
mace
;
template
<
DeviceType
D
,
typename
T
>
void
TestBidirectionTransform
(
const
int
type
,
const
std
::
vector
<
index_t
>
&
input_shape
)
{
template
<
DeviceType
D
,
typename
T
>
void
TestBidirectionTransform
(
const
int
type
,
const
std
::
vector
<
index_t
>
&
input_shape
)
{
OpsTestNet
net
;
OpDefBuilder
(
"BufferToImage"
,
"BufferToImageTest"
)
.
Input
(
"Input"
)
...
...
@@ -34,7 +35,8 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
net
.
RunOp
(
D
);
// Check
ExpectTensorNear
<
T
>
(
*
net
.
GetOutput
(
"Input"
),
*
net
.
GetOutput
(
"I2BOutput"
),
1e-5
);
ExpectTensorNear
<
T
>
(
*
net
.
GetOutput
(
"Input"
),
*
net
.
GetOutput
(
"I2BOutput"
),
1e-5
);
}
TEST
(
BufferToImageTest
,
ArgSmall
)
{
...
...
@@ -54,51 +56,63 @@ TEST(BufferToImageTest, ArgLarge) {
}
TEST
(
BufferToImageTest
,
InputSmallSingleChannel
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
1
,
2
,
3
,
1
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
1
,
2
,
3
,
1
});
}
TEST
(
BufferToImageTest
,
InputSmallMultipleChannel
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
1
,
2
,
3
,
3
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
1
,
2
,
3
,
3
});
}
TEST
(
BufferToImageTest
,
InputSmallMultipleBatchAndChannel
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
3
,
2
,
3
,
3
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
3
,
2
,
3
,
3
});
}
TEST
(
BufferToImageTest
,
InputMedia
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
3
,
13
,
17
,
128
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
3
,
13
,
17
,
128
});
}
TEST
(
BufferToImageTest
,
InputLarge
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
3
,
64
,
64
,
256
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
IN_OUT_CHANNEL
,
{
3
,
64
,
64
,
256
});
}
TEST
(
BufferToImageTest
,
Filter1x1Small
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
1
,
1
,
3
,
5
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
1
,
1
,
3
,
5
});
}
TEST
(
BufferToImageTest
,
Filter1x1Media
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
1
,
1
,
13
,
17
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
1
,
1
,
13
,
17
});
}
TEST
(
BufferToImageTest
,
Filter1x1Large
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
1
,
1
,
128
,
512
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
1
,
1
,
128
,
512
});
}
TEST
(
BufferToImageTest
,
Filter3x3Small
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
3
,
3
,
3
,
5
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
3
,
3
,
3
,
5
});
}
TEST
(
BufferToImageTest
,
Filter3x3Meida
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
3
,
3
,
13
,
17
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
3
,
3
,
13
,
17
});
}
TEST
(
BufferToImageTest
,
Filter3x3Large
)
{
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
3
,
3
,
128
,
256
});
TestBidirectionTransform
<
DeviceType
::
OPENCL
,
float
>
(
kernels
::
CONV2D_FILTER
,
{
3
,
3
,
128
,
256
});
}
template
<
DeviceType
D
,
typename
T
>
void
TestDiffTypeBidirectionTransform
(
const
int
type
,
const
std
::
vector
<
index_t
>
&
input_shape
)
{
template
<
DeviceType
D
,
typename
T
>
void
TestDiffTypeBidirectionTransform
(
const
int
type
,
const
std
::
vector
<
index_t
>
&
input_shape
)
{
OpsTestNet
net
;
OpDefBuilder
(
"BufferToImage"
,
"BufferToImageTest"
)
.
Input
(
"Input"
)
...
...
@@ -123,14 +137,16 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t>
net
.
RunOp
(
D
);
// Check
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"Input"
),
*
net
.
GetOutput
(
"I2BOutput"
),
1e-2
);
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"Input"
),
*
net
.
GetOutput
(
"I2BOutput"
),
1e-2
);
}
TEST
(
BufferToImageTest
,
ArgFloatToHalfSmall
)
{
TestDiffTypeBidirectionTransform
<
DeviceType
::
OPENCL
,
half
>
(
kernels
::
ARGUMENT
,
{
11
});
TestDiffTypeBidirectionTransform
<
DeviceType
::
OPENCL
,
half
>
(
kernels
::
ARGUMENT
,
{
11
});
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestStringHalfBidirectionTransform
(
const
int
type
,
const
std
::
vector
<
index_t
>
&
input_shape
,
const
unsigned
char
*
input_data
)
{
...
...
@@ -142,9 +158,10 @@ void TestStringHalfBidirectionTransform(const int type,
.
AddIntArg
(
"T"
,
DataTypeToEnum
<
T
>::
value
)
.
Finalize
(
net
.
NewOperatorDef
());
const
half
*
h_data
=
reinterpret_cast
<
const
half
*>
(
input_data
);
const
half
*
h_data
=
reinterpret_cast
<
const
half
*>
(
input_data
);
net
.
AddInputFromArray
<
D
,
half
>
(
"Input"
,
input_shape
,
std
::
vector
<
half
>
(
h_data
,
h_data
+
2
));
net
.
AddInputFromArray
<
D
,
half
>
(
"Input"
,
input_shape
,
std
::
vector
<
half
>
(
h_data
,
h_data
+
2
));
// Run
net
.
RunOp
(
D
);
...
...
@@ -160,12 +177,14 @@ void TestStringHalfBidirectionTransform(const int type,
net
.
RunOp
(
D
);
// Check
ExpectTensorNear
<
half
>
(
*
net
.
GetOutput
(
"Input"
),
*
net
.
GetOutput
(
"I2BOutput"
),
1e-2
);
ExpectTensorNear
<
half
>
(
*
net
.
GetOutput
(
"Input"
),
*
net
.
GetOutput
(
"I2BOutput"
),
1e-2
);
}
TEST
(
BufferToImageTest
,
ArgStringHalfToHalfSmall
)
{
const
unsigned
char
input_data
[]
=
{
0xCD
,
0x3C
,
0x33
,
0x40
,};
TestStringHalfBidirectionTransform
<
DeviceType
::
OPENCL
,
half
>
(
kernels
::
ARGUMENT
,
{
2
},
input_data
);
const
unsigned
char
input_data
[]
=
{
0xCD
,
0x3C
,
0x33
,
0x40
,
};
TestStringHalfBidirectionTransform
<
DeviceType
::
OPENCL
,
half
>
(
kernels
::
ARGUMENT
,
{
2
},
input_data
);
}
mace/ops/channel_shuffle.h
浏览文件 @
4410ecd2
...
...
@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
input
->
shape
()[
1
]);
output
->
ResizeLike
(
input
);
functor_
(
input
->
data
<
T
>
(),
input
->
shape
().
data
(),
output
->
mutable_data
<
T
>
(),
future
);
functor_
(
input
->
data
<
T
>
(),
input
->
shape
().
data
(),
output
->
mutable_data
<
T
>
(),
future
);
return
true
;
}
...
...
mace/ops/channel_shuffle_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/concat.h
浏览文件 @
4410ecd2
...
...
@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
public:
ConcatOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"axis"
,
3
)){}
functor_
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"axis"
,
3
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
MACE_CHECK
(
this
->
InputSize
()
>=
2
)
<<
"There must be at least two inputs to concat"
;
MACE_CHECK
(
this
->
InputSize
()
>=
2
)
<<
"There must be at least two inputs to concat"
;
const
std
::
vector
<
const
Tensor
*>
input_list
=
this
->
Inputs
();
const
int32_t
concat_axis
=
OperatorBase
::
GetSingleArgument
<
int
>
(
"axis"
,
3
);
const
int32_t
input_dims
=
input_list
[
0
]
->
dim_size
();
...
...
mace/ops/concat_benchmark.cc
浏览文件 @
4410ecd2
...
...
@@ -38,8 +38,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
}
#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
static void BM_CONCAT_CPU_##DIM0##_##DIM1( \
int iters) { \
static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) { \
ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
} \
BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
...
...
@@ -90,10 +89,8 @@ static void OpenclConcatHelper(int iters,
}
}
#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE( \
int iters) { \
static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
...
...
mace/ops/concat_test.cc
浏览文件 @
4410ecd2
...
...
@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
concat_axis_size
+=
input_shapes
[
i
][
axis
];
GenerateRandomRealTypeData
(
input_shapes
[
i
],
inputs
[
i
]);
input_ptrs
[
i
]
=
inputs
[
i
].
data
();
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
MakeString
(
"Input"
,
i
),
input_shapes
[
i
],
inputs
[
i
]);
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
MakeString
(
"Input"
,
i
),
input_shapes
[
i
],
inputs
[
i
]);
}
// Run
...
...
@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
}
TEST_F
(
ConcatOpTest
,
OPENCLAlignedMultiInput
)
{
OpenclRandomTest
<
float
>
(
{{
3
,
32
,
32
,
32
},
{
3
,
32
,
32
,
32
},
{
3
,
32
,
32
,
32
},
{
3
,
32
,
32
,
32
}},
3
);
OpenclRandomTest
<
float
>
(
{{
3
,
32
,
32
,
32
},
{
3
,
32
,
32
,
32
},
{
3
,
32
,
32
,
32
},
{
3
,
32
,
32
,
32
}},
3
);
}
\ No newline at end of file
mace/ops/conv_2d_test.cc
浏览文件 @
4410ecd2
...
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/ops/conv_2d.h"
#include <fstream>
#include "mace/ops/ops_test_util.h"
using
namespace
mace
;
...
...
@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F
(
Conv2dOpTest
,
OPENCLConv1x1
)
{
TestConv1x1
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
,
typename
T
>
static
void
TestComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
stride
)
{
static
void
TestComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
stride
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
...
...
@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedConvNxNS12
)
{
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
1
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
2
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
1
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
2
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS12
)
{
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
1
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
2
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
1
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
2
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS34
)
{
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
31
,
113
,
13
,
17
},
3
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
13
,
17
},
4
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
31
,
113
,
13
,
17
},
3
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
13
,
17
},
4
);
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
static
void
TestHalfComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
dilations
)
{
...
...
@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
1
,
32
,
64
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
1
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
3
,
3
,
32
,
64
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
3
,
3
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv15x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
15
,
1
,
256
,
2
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
15
,
1
,
256
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x15S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
15
,
256
,
2
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
15
,
256
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv7x75S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
7
,
7
,
3
,
64
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
7
,
7
,
3
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
1
,
1
,
5
,
7
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
1
,
1
,
5
,
7
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
3
,
3
,
5
,
7
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
3
,
3
,
5
,
7
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv5x5Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
5
,
5
,
16
,
16
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
5
,
5
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
7
,
7
,
16
,
16
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
7
,
7
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation4
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
63
,
67
},
{
7
,
7
,
16
,
16
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
63
,
67
},
{
7
,
7
,
16
,
16
},
{
4
,
4
});
}
template
<
DeviceType
D
,
typename
T
>
static
void
TestDilationConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation_rate
)
{
template
<
DeviceType
D
,
typename
T
>
static
void
TestDilationConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation_rate
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
...
...
@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
...
...
@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// Run on device
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
...
...
@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedDilation2
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
2
);
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
2
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLAligned2Dilation4
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
4
);
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
4
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedDilation4
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
4
);
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
4
);
}
template
<
DeviceType
D
,
typename
T
>
static
void
TestArbitraryPadConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
int
>
&
paddings
)
{
template
<
DeviceType
D
,
typename
T
>
static
void
TestArbitraryPadConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
int
>
&
paddings
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
)
{
srand
(
time
(
NULL
));
...
...
@@ -698,9 +686,12 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
...
...
@@ -714,7 +705,8 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
// Run on device
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
...
...
@@ -726,8 +718,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedPad1
)
{
TestArbitraryPadConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
{
1
,
1
});
TestArbitraryPadConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedPad2
)
{
...
...
@@ -736,6 +727,5 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedPad4
)
{
TestArbitraryPadConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
{
4
,
4
});
TestArbitraryPadConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
{
4
,
4
});
}
mace/ops/eltwise.h
浏览文件 @
4410ecd2
...
...
@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
functor_
(
static_cast
<
kernels
::
EltwiseType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"type"
,
static_cast
<
int
>
(
kernels
::
EltwiseType
::
SUM
))),
OperatorBase
::
GetRepeatedArgument
<
float
>
(
"coeff"
)){}
OperatorBase
::
GetRepeatedArgument
<
float
>
(
"coeff"
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input0
=
this
->
Input
(
0
);
const
Tensor
*
input1
=
this
->
Input
(
1
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
())
<<
"Inputs of Eltwise op must be same shape"
;
for
(
int
i
=
0
;
i
<
input0
->
dim_size
();
++
i
)
{
MACE_CHECK
(
input0
->
dim
(
i
)
==
input1
->
dim
(
i
))
<<
"Inputs of Eltwise op must be same shape"
;
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
())
<<
"Inputs of Eltwise op must be same shape"
;
for
(
int
i
=
0
;
i
<
input0
->
dim_size
();
++
i
)
{
MACE_CHECK
(
input0
->
dim
(
i
)
==
input1
->
dim
(
i
))
<<
"Inputs of Eltwise op must be same shape"
;
}
output
->
ResizeLike
(
input0
);
...
...
mace/ops/eltwise_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/eltwise_test.cc
浏览文件 @
4410ecd2
...
...
@@ -2,15 +2,15 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/eltwise.h"
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/eltwise.h"
namespace
mace
{
class
EltwiseOpTest
:
public
OpsTestBase
{};
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
(
const
kernels
::
EltwiseType
type
,
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
float
>
&
input0
,
...
...
@@ -36,8 +36,10 @@ void Simple(const kernels::EltwiseType type,
// Run
net
.
RunOp
(
D
);
}
else
{
BufferToImage
<
D
,
half
>
(
net
,
"Input1"
,
"InputImg1"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
half
>
(
net
,
"Input2"
,
"InputImg2"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
half
>
(
net
,
"Input1"
,
"InputImg1"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
half
>
(
net
,
"Input2"
,
"InputImg2"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"InputImg1"
)
.
Input
(
"InputImg2"
)
...
...
@@ -49,7 +51,8 @@ void Simple(const kernels::EltwiseType type,
// Run
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
float
>
(
net
,
"OutputImg"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
float
>
(
net
,
"OutputImg"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
auto
expected
=
CreateTensor
<
float
>
(
shape
,
output
);
...
...
@@ -58,64 +61,42 @@ void Simple(const kernels::EltwiseType type,
}
TEST_F
(
EltwiseOpTest
,
CPUSimple
)
{
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
16
,
25
,
36
});
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
4
,
6
,
8
,
10
,
12
});
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
3
,
6
,
9
,
12
,
15
,
18
},
{
2
,
1
});
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
3
,
6
,
9
,
12
,
15
,
18
},
{
2
,
1
});
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
{
1
,
2
,
3
,
4
,
6
,
6
});
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
Simple
<
DeviceType
::
CPU
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
{
1
,
1
,
3
,
3
,
5
,
6
});
}
TEST_F
(
EltwiseOpTest
,
GPUSimple
)
{
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
16
,
25
,
36
});
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
4
,
6
,
8
,
10
,
12
});
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
3
,
6
,
9
,
12
,
15
,
18
},
{
2
,
1
});
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
3
,
6
,
9
,
12
,
15
,
18
},
{
2
,
1
});
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
{
1
,
2
,
3
,
4
,
6
,
6
});
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
Simple
<
DeviceType
::
OPENCL
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
3
,
3
,
6
,
6
},
{
1
,
1
,
3
,
3
,
5
,
6
});
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
RandomTest
(
const
kernels
::
EltwiseType
type
,
const
std
::
vector
<
index_t
>
&
shape
)
{
testing
::
internal
::
LogToStderr
();
...
...
@@ -139,8 +120,10 @@ void RandomTest(const kernels::EltwiseType type,
// Run
net
.
RunOp
();
BufferToImage
<
D
,
T
>
(
net
,
"Input1"
,
"InputImg1"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Input2"
,
"InputImg2"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Input1"
,
"InputImg1"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Input2"
,
"InputImg2"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"InputImg1"
)
.
Input
(
"InputImg2"
)
...
...
@@ -153,12 +136,15 @@ void RandomTest(const kernels::EltwiseType type,
// Run
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
float
>
(
net
,
"OutputImg"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
float
>
(
net
,
"OutputImg"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT
)
{
ExpectTensorNear
<
float
>
(
*
net
.
GetTensor
(
"Output"
),
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-3
);
ExpectTensorNear
<
float
>
(
*
net
.
GetTensor
(
"Output"
),
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-3
);
}
else
{
ExpectTensorNear
<
float
>
(
*
net
.
GetTensor
(
"Output"
),
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-1
);
ExpectTensorNear
<
float
>
(
*
net
.
GetTensor
(
"Output"
),
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-1
);
}
}
...
...
mace/ops/folded_batch_norm.cc
浏览文件 @
4410ecd2
...
...
@@ -7,22 +7,19 @@
namespace
mace
{
void
Register_FoldedBatchNorm
(
OperatorRegistry
*
op_registry
)
{
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
.
Device
(
DeviceType
::
CPU
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
FoldedBatchNormOp
<
DeviceType
::
CPU
,
float
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
FoldedBatchNormOp
<
DeviceType
::
OPENCL
,
float
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
half
>
(
"T"
)
.
Build
(),
...
...
mace/ops/folded_batch_norm_test.cc
浏览文件 @
4410ecd2
...
...
@@ -17,7 +17,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
std
::
vector
<
float
>
&
scale
,
std
::
vector
<
float
>
&
offset
)
{
size_t
size
=
gamma
.
size
();
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
scale
[
i
]
=
gamma
[
i
]
/
std
::
sqrt
(
var
[
i
]
+
epsilon
);
offset
[
i
]
=
offset
[
i
]
-
mean
[
i
]
*
scale
[
i
];
}
...
...
mace/ops/fully_connected.h
浏览文件 @
4410ecd2
...
...
@@ -15,8 +15,7 @@ class FullyConnectedOp : public Operator<D, T> {
public:
FullyConnectedOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
functor_
(
kernels
::
StringToActivationType
(
functor_
(
kernels
::
StringToActivationType
(
OperatorBase
::
GetSingleArgument
<
std
::
string
>
(
"activation"
,
"NOOP"
)),
OperatorBase
::
GetSingleArgument
<
float
>
(
"max_limit"
,
0.0
f
))
{}
...
...
mace/ops/fully_connected_benchmark.cc
浏览文件 @
4410ecd2
...
...
@@ -17,7 +17,8 @@ static void FCBenchmark(
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channel
});
net
.
AddRandomInput
<
D
,
float
>
(
"Weight"
,
{
out_channel
,
height
*
width
*
channel
});
net
.
AddRandomInput
<
D
,
float
>
(
"Weight"
,
{
out_channel
,
height
*
width
*
channel
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
out_channel
});
if
(
D
==
DeviceType
::
OPENCL
)
{
...
...
@@ -58,9 +59,12 @@ static void FCBenchmark(
}
#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
const int64_t tot = \
static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
...
...
mace/ops/fully_connected_test.cc
浏览文件 @
4410ecd2
...
...
@@ -10,7 +10,7 @@ namespace mace {
class
FullyConnectedOpTest
:
public
OpsTestBase
{};
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
float
>
&
input_value
,
const
std
::
vector
<
index_t
>
&
weight_shape
,
...
...
@@ -58,83 +58,52 @@ void Simple(const std::vector<index_t> &input_shape,
}
// Check
auto
expected
=
CreateTensor
<
float
>
(
output_shape
,
output_value
);
auto
expected
=
CreateTensor
<
float
>
(
output_shape
,
output_value
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
TEST_F
(
FullyConnectedOpTest
,
SimpleCPU
)
{
Simple
<
DeviceType
::
CPU
>
({
1
,
2
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
8
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
},
{
2
},
{
1
,
1
,
1
,
1
},
{
206
});
Simple
<
DeviceType
::
CPU
>
({
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
CPU
>
({
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
Simple
<
DeviceType
::
CPU
>
({
1
,
2
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
8
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
},
{
2
},
{
1
,
1
,
1
,
1
},
{
206
});
Simple
<
DeviceType
::
CPU
>
(
{
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
CPU
>
(
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
}
TEST_F
(
FullyConnectedOpTest
,
SimpleCPUWithBatch
)
{
Simple
<
DeviceType
::
CPU
>
({
2
,
1
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
4
},
{
1
,
2
,
3
,
4
},
{
1
},
{
2
},
{
2
,
1
,
1
,
1
},
{
32
,
72
});
Simple
<
DeviceType
::
CPU
>
({
2
,
1
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
4
},
{
1
,
2
,
3
,
4
},
{
1
},
{
2
},
{
2
,
1
,
1
,
1
},
{
32
,
72
});
}
TEST_F
(
FullyConnectedOpTest
,
SimpleOPENCL
)
{
Simple
<
DeviceType
::
OPENCL
>
({
1
,
2
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
8
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
},
{
2
},
{
1
,
1
,
1
,
1
},
{
206
});
Simple
<
DeviceType
::
OPENCL
>
({
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
OPENCL
>
({
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
Simple
<
DeviceType
::
OPENCL
>
({
1
,
2
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
8
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
},
{
2
},
{
1
,
1
,
1
,
1
},
{
206
});
Simple
<
DeviceType
::
OPENCL
>
(
{
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
OPENCL
>
(
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
}
TEST_F
(
FullyConnectedOpTest
,
SimpleGPUWithBatch
)
{
Simple
<
DeviceType
::
OPENCL
>
({
2
,
1
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
4
},
{
1
,
2
,
3
,
4
},
{
1
},
{
2
},
{
2
,
1
,
1
,
1
},
{
32
,
72
});
Simple
<
DeviceType
::
OPENCL
>
({
2
,
1
,
2
,
2
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
,
4
},
{
1
,
2
,
3
,
4
},
{
1
},
{
2
},
{
2
,
1
,
1
,
1
},
{
32
,
72
});
}
template
<
typename
T
>
template
<
typename
T
>
void
Complex
(
const
index_t
batch
,
const
index_t
height
,
const
index_t
width
,
...
...
@@ -156,8 +125,7 @@ void Complex(const index_t batch,
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Weight"
,
{
out_channel
,
height
*
width
*
channels
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Bias"
,
{
out_channel
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Bias"
,
{
out_channel
});
// run cpu
net
.
RunOp
();
...
...
@@ -215,6 +183,4 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex
<
half
>
(
16
,
13
,
12
,
31
,
113
);
Complex
<
half
>
(
31
,
21
,
11
,
23
,
103
);
}
}
mace/ops/fused_conv_2d_test.cc
浏览文件 @
4410ecd2
...
...
@@ -511,8 +511,9 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
TestGeneralConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
40
,
40
},
{
15
,
1
,
32
,
64
});
}
template
<
DeviceType
D
,
typename
T
>
static
void
TestAtrousConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation
)
{
template
<
DeviceType
D
,
typename
T
>
static
void
TestAtrousConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
...
...
@@ -550,9 +551,12 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FusedConv2D"
,
"FusedConv2dTest"
)
.
Input
(
"InputImage"
)
...
...
@@ -567,7 +571,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
// Run on device
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
...
...
@@ -591,7 +596,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
TestAtrousConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
2
);
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
static
void
TestGeneralHalfAtrousConv
(
const
std
::
vector
<
index_t
>
&
image_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
dilations
)
{
...
...
@@ -620,7 +625,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
input_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
...
...
@@ -632,9 +638,12 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
BufferToImage
<
D
,
half
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
half
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
half
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
half
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
half
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
half
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FusedConv2D"
,
"FusedConv2dTest"
)
.
Input
(
"InputImage"
)
...
...
@@ -649,7 +658,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// Run on device
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
float
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
float
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.7
);
};
...
...
@@ -658,13 +668,11 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
}
TEST_F
(
FusedConv2dOpTest
,
OPENCL7X7AtrousConvD2
)
{
TestGeneralHalfAtrousConv
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
7
,
7
,
3
,
16
},
TestGeneralHalfAtrousConv
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
7
,
7
,
3
,
16
},
{
2
,
2
});
}
TEST_F
(
FusedConv2dOpTest
,
OPENCL15X15AtrousConvD4
)
{
TestGeneralHalfAtrousConv
<
DeviceType
::
OPENCL
>
({
63
,
71
},
{
15
,
15
,
16
,
16
},
TestGeneralHalfAtrousConv
<
DeviceType
::
OPENCL
>
({
63
,
71
},
{
15
,
15
,
16
,
16
},
{
2
,
2
});
}
mace/ops/global_avg_pooling.h
浏览文件 @
4410ecd2
//DMACE_ENABLE_NEON
//
DMACE_ENABLE_NEON
// Copyright (c) 2017 XiaoMi All rights reserved.
//
...
...
mace/ops/global_avg_pooling_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/image_to_buffer.h
浏览文件 @
4410ecd2
...
...
@@ -11,7 +11,7 @@
namespace
mace
{
template
<
DeviceType
D
,
typename
T
>
class
ImageToBufferOp
:
public
Operator
<
D
,
T
>
{
class
ImageToBufferOp
:
public
Operator
<
D
,
T
>
{
public:
ImageToBufferOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
true
)
{}
...
...
@@ -20,7 +20,8 @@ class ImageToBufferOp: public Operator<D, T> {
const
Tensor
*
input_tensor
=
this
->
Input
(
INPUT
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
kernels
::
BufferType
type
=
static_cast
<
kernels
::
BufferType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
kernels
::
BufferType
type
=
static_cast
<
kernels
::
BufferType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"buffer_type"
,
static_cast
<
int
>
(
kernels
::
CONV2D_FILTER
)));
functor_
(
output
,
type
,
const_cast
<
Tensor
*>
(
input_tensor
),
future
);
return
true
;
...
...
mace/ops/matmul.h
浏览文件 @
4410ecd2
mace/ops/matmul_test.cc
浏览文件 @
4410ecd2
...
...
@@ -10,7 +10,7 @@ namespace mace {
class
MatMulOpTest
:
public
OpsTestBase
{};
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
(
const
std
::
vector
<
index_t
>
&
A_shape
,
const
std
::
vector
<
float
>
&
A_value
,
const
std
::
vector
<
index_t
>
&
B_shape
,
...
...
@@ -51,29 +51,24 @@ void Simple(const std::vector<index_t> &A_shape,
}
// Check
auto
expected
=
CreateTensor
<
float
>
(
C_shape
,
C_value
);
auto
expected
=
CreateTensor
<
float
>
(
C_shape
,
C_value
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
TEST_F
(
MatMulOpTest
,
SimpleCPU
)
{
Simple
<
DeviceType
::
CPU
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
CPU
>
({
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
Simple
<
DeviceType
::
CPU
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
CPU
>
(
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
}
TEST_F
(
MatMulOpTest
,
SimpleCPUWithBatch
)
{
Simple
<
DeviceType
::
CPU
>
({
2
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
1
,
2
,
3
,
4
,
5
,
6
},
...
...
@@ -81,19 +76,17 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
}
TEST_F
(
MatMulOpTest
,
SimpleOPENCL
)
{
Simple
<
DeviceType
::
OPENCL
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
OPENCL
>
({
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
Simple
<
DeviceType
::
OPENCL
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
OPENCL
>
(
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
{
1
,
5
,
5
,
1
},
{
215
,
230
,
245
,
260
,
275
,
490
,
530
,
570
,
610
,
650
,
765
,
830
,
895
,
960
,
1025
,
1040
,
1130
,
1220
,
1310
,
1400
,
1315
,
1430
,
1545
,
1660
,
1775
});
}
TEST_F
(
MatMulOpTest
,
SimpleGPUWithBatch
)
{
...
...
@@ -118,8 +111,8 @@ void Complex(const index_t batch,
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"A"
,
{
batch
,
height
,
channels
,
1
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"A"
,
{
batch
,
height
,
channels
,
1
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
,
1
});
...
...
@@ -177,5 +170,4 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex
<
half
>
(
16
,
32
,
64
,
64
);
Complex
<
half
>
(
31
,
31
,
61
,
67
);
}
}
mace/ops/ops_test_util.h
浏览文件 @
4410ecd2
...
...
@@ -95,7 +95,7 @@ class OpDefBuilder {
class
OpsTestNet
{
public:
OpsTestNet
()
:
op_registry_
(
new
OperatorRegistry
())
{};
OpsTestNet
()
:
op_registry_
(
new
OperatorRegistry
()){};
template
<
DeviceType
D
,
typename
T
>
void
AddInputFromArray
(
const
std
::
string
&
name
,
...
...
@@ -334,9 +334,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
for
(
int
h
=
0
;
h
<
x
.
dim
(
1
);
++
h
)
{
for
(
int
w
=
0
;
w
<
x
.
dim
(
2
);
++
w
)
{
for
(
int
c
=
0
;
c
<
x
.
dim
(
3
);
++
c
)
{
EXPECT_NEAR
(
*
a
,
*
b
,
abs_err
)
<<
"with index = ["
<<
n
<<
", "
<<
h
<<
", "
<<
w
<<
", "
<<
c
<<
"]"
;
EXPECT_NEAR
(
*
a
,
*
b
,
abs_err
)
<<
"with index = ["
<<
n
<<
", "
<<
h
<<
", "
<<
w
<<
", "
<<
c
<<
"]"
;
a
++
;
b
++
;
}
...
...
mace/ops/pooling.h
浏览文件 @
4410ecd2
...
...
@@ -20,8 +20,12 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
pooling_type_
(
static_cast
<
PoolingType
>
(
OperatorBase
::
GetSingleArgument
<
int
>
(
"pooling_type"
,
static_cast
<
int
>
(
AVG
)))),
functor_
(
pooling_type_
,
kernels_
.
data
(),
this
->
strides_
.
data
(),
this
->
padding_type_
,
this
->
paddings_
,
this
->
dilations_
.
data
()){};
functor_
(
pooling_type_
,
kernels_
.
data
(),
this
->
strides_
.
data
(),
this
->
padding_type_
,
this
->
paddings_
,
this
->
dilations_
.
data
()){};
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
mace/ops/pooling_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/pooling_test.cc
浏览文件 @
4410ecd2
...
...
@@ -198,7 +198,8 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
...
...
@@ -333,7 +334,8 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
...
...
mace/ops/reshape.h
浏览文件 @
4410ecd2
...
...
@@ -15,7 +15,7 @@ class ReshapeOp : public Operator<D, T> {
public:
ReshapeOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
shape_
(
OperatorBase
::
GetRepeatedArgument
<
int64_t
>
(
"shape"
)){}
shape_
(
OperatorBase
::
GetRepeatedArgument
<
int64_t
>
(
"shape"
))
{}
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
@@ -38,9 +38,11 @@ class ReshapeOp : public Operator<D, T> {
}
if
(
unknown_idx
!=
-
1
)
{
MACE_CHECK
(
product
!=
0
)
<<
"Cannot infer shape if there is zero shape size."
;
MACE_CHECK
(
product
!=
0
)
<<
"Cannot infer shape if there is zero shape size."
;
const
index_t
missing
=
input
->
size
()
/
product
;
MACE_CHECK
(
missing
*
product
==
input
->
size
())
<<
"Input size not match reshaped tensor size"
;
MACE_CHECK
(
missing
*
product
==
input
->
size
())
<<
"Input size not match reshaped tensor size"
;
out_shape
[
unknown_idx
]
=
missing
;
}
...
...
mace/ops/reshape_test.cc
浏览文件 @
4410ecd2
...
...
@@ -13,7 +13,6 @@ class ReshapeTest : public OpsTestBase {};
void
TestReshape
(
const
std
::
vector
<
index_t
>
&
org_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
std
::
vector
<
index_t
>
&
res_shape
)
{
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"Reshape"
,
"ReshapeTest"
)
...
...
mace/ops/softmax.cc
浏览文件 @
4410ecd2
mace/ops/softmax.h
浏览文件 @
4410ecd2
...
...
@@ -14,11 +14,10 @@ template <DeviceType D, class T>
class
SoftmaxOp
:
public
Operator
<
D
,
T
>
{
public:
SoftmaxOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{
}
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{}
bool
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
logits
=
this
->
Input
(
LOGITS
);
const
Tensor
*
logits
=
this
->
Input
(
LOGITS
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
output
->
ResizeLike
(
logits
);
...
...
mace/ops/softmax_test.cc
浏览文件 @
4410ecd2
...
...
@@ -14,7 +14,8 @@ void Simple() {
// Construct graph
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
1
,
2
,
4
},
{
1
,
1
,
1
,
1
,
1
,
2
,
3
,
4
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
1
,
2
,
4
},
{
1
,
1
,
1
,
1
,
1
,
2
,
3
,
4
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
,
float
>
(
net
,
"Input"
,
"InputImage"
,
...
...
@@ -41,18 +42,15 @@ void Simple() {
net
.
RunOp
(
D
);
}
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
4
},
{
0.25
,
0.25
,
0.25
,
0.25
,
0.0320586
,
0.08714432
,
0.23688282
,
0.64391426
});
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
1
,
2
,
4
},
{
0.25
,
0.25
,
0.25
,
0.25
,
0.0320586
,
0.08714432
,
0.23688282
,
0.64391426
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-7
);
}
TEST_F
(
SoftmaxOpTest
,
CPUSimple
)
{
Simple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
SoftmaxOpTest
,
OPENCLSimple
)
{
Simple
<
DeviceType
::
OPENCL
>
();
}
TEST_F
(
SoftmaxOpTest
,
CPUSimple
)
{
Simple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
SoftmaxOpTest
,
OPENCLSimple
)
{
Simple
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
void
Complex
(
const
std
::
vector
<
index_t
>
&
logits_shape
)
{
...
...
mace/ops/space_to_batch.h
浏览文件 @
4410ecd2
...
...
@@ -12,7 +12,7 @@
namespace
mace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
class
SpaceToBatchNDOp
:
public
Operator
<
D
,
T
>
{
public:
SpaceToBatchNDOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
...
...
@@ -28,17 +28,19 @@ class SpaceToBatchNDOp : public Operator<D, T> {
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
CalculateOutputShape
(
space_tensor
,
batch_tensor
,
output_shape
.
data
());
functor_
(
const_cast
<
Tensor
*>
(
space_tensor
),
output_shape
,
batch_tensor
,
future
);
functor_
(
const_cast
<
Tensor
*>
(
space_tensor
),
output_shape
,
batch_tensor
,
future
);
return
true
;
}
private:
inline
void
CalculateOutputShape
(
const
Tensor
*
input_tensor
,
Tensor
*
output
,
index_t
*
output_shape
)
{
auto
paddings
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"paddings"
,
{
0
,
0
,
0
,
0
});
auto
block_shape
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"block_shape"
,
{
1
,
1
});
auto
paddings
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"paddings"
,
{
0
,
0
,
0
,
0
});
auto
block_shape
=
OperatorBase
::
GetRepeatedArgument
<
int
>
(
"block_shape"
,
{
1
,
1
});
MACE_CHECK
(
input_tensor
->
dim_size
()
==
4
,
"Input's shape should be 4D"
);
MACE_CHECK
(
block_shape
.
size
()
==
2
,
"Block's shape should be 1D"
);
MACE_CHECK
(
paddings
.
size
()
==
4
,
"Paddings' shape should be 2D"
);
...
...
@@ -46,13 +48,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
const
index_t
block_dims
=
block_shape
.
size
();
index_t
block_shape_product
=
1
;
for
(
uint32_t
block_dim
=
0
;
block_dim
<
block_dims
;
++
block_dim
)
{
MACE_CHECK
(
block_shape
[
block_dim
]
>
1
,
"block_shape's value should be great to 1"
);
MACE_CHECK
(
block_shape
[
block_dim
]
>
1
,
"block_shape's value should be great to 1"
);
const
index_t
block_shape_value
=
block_shape
[
block_dim
];
const
index_t
padded_input_size
=
input_tensor
->
dim
(
block_dim
+
1
)
+
paddings
[
block_dim
*
2
]
+
paddings
[
block_dim
*
2
+
1
];
MACE_CHECK
(
padded_input_size
%
block_shape_value
==
0
,
"padded input "
,
padded_input_size
,
" is not divisible by block_shape"
);
const
index_t
padded_input_size
=
input_tensor
->
dim
(
block_dim
+
1
)
+
paddings
[
block_dim
*
2
]
+
paddings
[
block_dim
*
2
+
1
];
MACE_CHECK
(
padded_input_size
%
block_shape_value
==
0
,
"padded input "
,
padded_input_size
,
" is not divisible by block_shape"
);
block_shape_product
*=
block_shape_value
;
output_shape
[
block_dim
+
1
]
=
padded_input_size
/
block_shape_value
;
}
...
...
mace/ops/space_to_batch_benchmark.cc
浏览文件 @
4410ecd2
mace/ops/winograd_convolution_test.cc
浏览文件 @
4410ecd2
...
...
@@ -4,8 +4,8 @@
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace
mace
{
...
...
@@ -21,7 +21,9 @@ void TransposeFilter(const std::vector<float> &input,
for
(
index_t
w
=
0
;
w
<
input_shape
[
1
];
++
w
)
{
for
(
index_t
oc
=
0
;
oc
<
input_shape
[
2
];
++
oc
)
{
for
(
index_t
ic
=
0
;
ic
<
input_shape
[
3
];
++
ic
)
{
int
offset
=
((
oc
*
input_shape
[
3
]
+
ic
)
*
input_shape
[
0
]
+
h
)
*
input_shape
[
1
]
+
w
;
int
offset
=
((
oc
*
input_shape
[
3
]
+
ic
)
*
input_shape
[
0
]
+
h
)
*
input_shape
[
1
]
+
w
;
output
[
offset
]
=
*
input_ptr
;
++
input_ptr
;
}
...
...
@@ -30,7 +32,7 @@ void TransposeFilter(const std::vector<float> &input,
}
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
WinogradConvolution
(
const
index_t
batch
,
const
index_t
height
,
const
index_t
width
,
...
...
@@ -53,8 +55,7 @@ void WinogradConvolution(const index_t batch,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
...
...
@@ -78,8 +79,10 @@ void WinogradConvolution(const index_t batch,
// transform filter
std
::
vector
<
float
>
wino_filter_data
;
TransposeFilter
(
filter_data
,
filter_shape
,
wino_filter_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"WinoFilterData"
,
{
out_channels
,
in_channels
,
3
,
3
},
wino_filter_data
);
BufferToImage
<
D
,
T
>
(
net
,
"WinoFilterData"
,
"WinoFilter"
,
kernels
::
BufferType
::
WINOGRAD_FILTER
);
net
.
AddInputFromArray
<
D
,
float
>
(
"WinoFilterData"
,
{
out_channels
,
in_channels
,
3
,
3
},
wino_filter_data
);
BufferToImage
<
D
,
T
>
(
net
,
"WinoFilterData"
,
"WinoFilter"
,
kernels
::
BufferType
::
WINOGRAD_FILTER
);
// transform input
OpDefBuilder
(
"WinogradTransform"
,
"WinogradTransformTest"
)
...
...
@@ -126,18 +129,23 @@ void WinogradConvolution(const index_t batch,
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolution
)
{
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
SAME
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
SAME
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolution
)
{
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
61
,
67
,
31
,
37
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
61
,
67
,
31
,
37
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolution
)
{
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
3
,
64
,
64
,
32
,
32
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
3
,
64
,
64
,
32
,
32
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
OPENCL
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
}
}
mace/ops/winograd_inverse_transform.h
浏览文件 @
4410ecd2
...
...
@@ -8,12 +8,12 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/winograd_transform.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/winograd_transform.h"
namespace
mace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
class
WinogradInverseTransformOp
:
public
Operator
<
D
,
T
>
{
public:
WinogradInverseTransformOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
...
...
mace/ops/winograd_transform.h
浏览文件 @
4410ecd2
...
...
@@ -12,7 +12,7 @@
namespace
mace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
class
WinogradTransformOp
:
public
Operator
<
D
,
T
>
{
public:
WinogradTransformOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
...
...
mace/ops/winograd_transform_benchmark.cc
浏览文件 @
4410ecd2
...
...
@@ -37,16 +37,14 @@ static void BMWinogradTransform(
}
#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
} \
BENCHMARK( \
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
...
...
mace/public/mace.h
浏览文件 @
4410ecd2
...
...
@@ -6,10 +6,10 @@
#define MACE_CORE_MACE_H_
#include <cstdint>
#include <vector>
#include <string>
#include <memory>
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace
mace
{
...
...
@@ -29,9 +29,7 @@ namespace mace {
(MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \
MACE_PATCH_VERSION) MACE_VERSION_SUFFIX)
inline
const
char
*
MaceVersion
()
{
return
MACE_VERSION_STRING
;
}
inline
const
char
*
MaceVersion
()
{
return
MACE_VERSION_STRING
;
}
extern
const
char
*
MaceGitVersion
();
...
...
@@ -43,17 +41,9 @@ extern const char *MaceGitVersion();
classname &operator=(const classname &) = delete
#endif
enum
NetMode
{
INIT
=
0
,
NORMAL
=
1
};
enum
NetMode
{
INIT
=
0
,
NORMAL
=
1
};
enum
DeviceType
{
CPU
=
0
,
NEON
=
1
,
OPENCL
=
2
,
HEXAGON
=
3
};
enum
DeviceType
{
CPU
=
0
,
NEON
=
1
,
OPENCL
=
2
,
HEXAGON
=
3
};
enum
DataType
{
DT_INVALID
=
0
,
...
...
@@ -104,6 +94,7 @@ class Argument {
public:
Argument
();
void
CopyFrom
(
const
Argument
&
from
);
public:
const
std
::
string
&
name
()
const
;
void
set_name
(
const
std
::
string
&
value
);
...
...
@@ -147,11 +138,13 @@ class NodeInput {
NodeInput
()
{}
NodeInput
(
int
node_id
,
int
output_port
);
void
CopyFrom
(
const
NodeInput
&
from
);
public:
int
node_id
()
const
;
void
set_node_id
(
int
node_id
);
int
output_port
()
const
;
void
set_output_port
(
int
output_port
);
private:
int
node_id_
;
int
output_port_
;
...
...
@@ -162,8 +155,10 @@ class OutputShape {
OutputShape
();
OutputShape
(
const
std
::
vector
<
int64_t
>
&
dims
);
void
CopyFrom
(
const
OutputShape
&
from
);
public:
const
std
::
vector
<
int64_t
>
&
dims
()
const
;
private:
std
::
vector
<
int64_t
>
dims_
;
};
...
...
@@ -240,10 +235,12 @@ class OperatorDef {
class
MemoryBlock
{
public:
MemoryBlock
(
int
mem_id
,
uint32_t
x
,
uint32_t
y
);
public:
int
mem_id
()
const
;
uint32_t
x
()
const
;
uint32_t
y
()
const
;
private:
int
mem_id_
;
uint32_t
x_
;
...
...
@@ -255,9 +252,9 @@ class MemoryArena {
const
std
::
vector
<
MemoryBlock
>
&
mem_block
()
const
;
std
::
vector
<
MemoryBlock
>
&
mutable_mem_block
();
int
mem_block_size
()
const
;
private:
std
::
vector
<
MemoryBlock
>
mem_block_
;
};
// for hexagon mace-nnlib
...
...
@@ -268,6 +265,7 @@ class InputInfo {
int32_t
max_byte_size
()
const
;
DataType
data_type
()
const
;
const
std
::
vector
<
int32_t
>
&
dims
()
const
;
private:
std
::
string
name_
;
int32_t
node_id_
;
...
...
@@ -285,6 +283,7 @@ class OutputInfo {
void
set_data_type
(
DataType
data_type
);
const
std
::
vector
<
int32_t
>
&
dims
()
const
;
void
set_dims
(
const
std
::
vector
<
int32_t
>
&
dims
);
private:
std
::
string
name_
;
int32_t
node_id_
;
...
...
@@ -299,6 +298,7 @@ class NetDef {
int
op_size
()
const
;
const
OperatorDef
&
op
(
const
int
idx
)
const
;
public:
const
std
::
string
&
name
()
const
;
bool
has_name
()
const
;
...
...
@@ -359,7 +359,6 @@ struct RunMetadata {
std
::
vector
<
OperatorStats
>
op_stats
;
};
class
Workspace
;
class
NetBase
;
class
OperatorRegistry
;
...
...
@@ -374,8 +373,7 @@ struct MaceInputInfo {
class
MaceEngine
{
public:
// Single input and output
explicit
MaceEngine
(
const
NetDef
*
net_def
,
DeviceType
device_type
);
explicit
MaceEngine
(
const
NetDef
*
net_def
,
DeviceType
device_type
);
// Multiple input or output
explicit
MaceEngine
(
const
NetDef
*
net_def
,
DeviceType
device_type
,
...
...
@@ -394,7 +392,7 @@ class MaceEngine {
// Multiple input or output
bool
Run
(
const
std
::
vector
<
MaceInputInfo
>
&
input
,
std
::
map
<
std
::
string
,
float
*>
&
output
,
RunMetadata
*
run_metadata
=
nullptr
);
RunMetadata
*
run_metadata
=
nullptr
);
MaceEngine
(
const
MaceEngine
&
)
=
delete
;
MaceEngine
&
operator
=
(
const
MaceEngine
&
)
=
delete
;
...
...
mace/utils/command_line_flags.h
浏览文件 @
4410ecd2
mace/utils/env_time.h
浏览文件 @
4410ecd2
...
...
@@ -9,7 +9,6 @@
#include <sys/time.h>
#include <time.h>
namespace
mace
{
inline
int64_t
NowMicros
()
{
...
...
mace/utils/logging.h
浏览文件 @
4410ecd2
...
...
@@ -10,8 +10,8 @@
#include <string>
#include <vector>
#include "mace/utils/env_time.h"
#include "mace/public/mace.h"
#include "mace/utils/env_time.h"
#include "mace/utils/string_util.h"
#undef ERROR
...
...
mace/utils/string_util.h
浏览文件 @
4410ecd2
mace/utils/timer.h
浏览文件 @
4410ecd2
...
...
@@ -24,13 +24,9 @@ class WallClockTimer : public Timer {
public:
WallClockTimer
()
:
accumulated_micros_
(
0
)
{}
void
StartTiming
()
override
{
start_micros_
=
NowMicros
();
}
void
StartTiming
()
override
{
start_micros_
=
NowMicros
();
}
void
StopTiming
()
override
{
stop_micros_
=
NowMicros
();
}
void
StopTiming
()
override
{
stop_micros_
=
NowMicros
();
}
void
AccumulateTiming
()
override
{
StopTiming
();
...
...
@@ -43,13 +39,9 @@ class WallClockTimer : public Timer {
accumulated_micros_
=
0
;
}
double
ElapsedMicros
()
override
{
return
stop_micros_
-
start_micros_
;
}
double
ElapsedMicros
()
override
{
return
stop_micros_
-
start_micros_
;
}
double
AccumulatedMicros
()
override
{
return
accumulated_micros_
;
}
double
AccumulatedMicros
()
override
{
return
accumulated_micros_
;
}
private:
double
start_micros_
;
...
...
mace/utils/tuner_test.cc
浏览文件 @
4410ecd2
...
...
@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
WallClockTimer
timer
;
std
::
vector
<
unsigned
int
>
default_params
(
1
,
1
);
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
,
res
);
default_params
[
0
]
=
2
;
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
+
1
,
res
);
}
...
...
@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
};
// tune
WallClockTimer
timer
;
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
*
params_generator
,
TunerFunc
,
&
timer
);
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
*
params_generator
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
,
res
);
// run
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
,
res
);
}
...
...
mace/utils/utils.h
浏览文件 @
4410ecd2
mace/utils/utils_test.cc
浏览文件 @
4410ecd2
...
...
@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
WallClockTimer
timer
;
std
::
vector
<
unsigned
int
>
default_params
(
1
,
1
);
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
,
res
);
default_params
[
0
]
=
2
;
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
+
1
,
res
);
}
...
...
@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
};
// tune
WallClockTimer
timer
;
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
*
params_generator
,
TunerFunc
,
&
timer
);
int
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
*
params_generator
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
,
res
);
// run
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
res
=
Tuner
<
unsigned
int
>::
Get
()
->
template
TuneOrRun
<
unsigned
int
>(
"SimpleRun"
,
default_params
,
nullptr
,
TunerFunc
,
&
timer
);
EXPECT_EQ
(
expect
,
res
);
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录