as350144 / Mace (forked from Xiaomi / Mace)

Commit b1397592
Authored Dec 12, 2017 by yejianwu

fix conflix

Parents: 79d940af, ecef3596

Showing 15 changed files with 626 additions and 357 deletions (+626, -357)
mace/kernels/opencl/addn.cc                    +42   -8
mace/kernels/opencl/batch_norm_opencl.cc        +6   -1
mace/kernels/opencl/concat.cc                  +50  -13
mace/kernels/opencl/conv_2d_opencl_1x1.cc       +6   -1
mace/kernels/opencl/conv_2d_opencl_3x3.cc       +6   -1
mace/kernels/opencl/conv_2d_opencl_general.cc   +6   -1
mace/kernels/opencl/pooling_opencl.cc          +55  -18
mace/kernels/opencl/relu_opencl.cc              +6   -1
mace/kernels/opencl/resize_bilinear_opencl.cc  +46  -12
mace/python/tools/BUILD                         +1   -0
mace/python/tools/memory_optimizer.py           +4  -17
mace/python/tools/tf_converter_lib.py         +379 -268
mace/python/tools/tf_dsp_converter_lib.py       +1   -0
tools/validate.py                               +6   -2
tools/validate_gcn.sh                          +12  -14
mace/kernels/opencl/addn.cc

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -33,8 +34,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
   auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
 
-  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
-
   uint32_t idx = 0;
   for (auto input : input_tensors) {
     addn_kernel.setArg(idx++,
@@ -42,12 +41,47 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   }
   addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      addn_kernel, cl::NullRange,
-      cl::NDRange(width_pixels, batch_height_pixels),
-      cl::NDRange(64, 16),  // TODO fix this
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
+  std::vector<uint32_t> lws = {64, 16};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
+    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1]},
+            {kwg_size / 16, 16},
+            {kwg_size / 32, 32},
+            {kwg_size / 64, 64},
+            {kwg_size / 128, 128},
+            {kwg_size / 256, 256},
+            {kwg_size, 1},
+            {1, kwg_size}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        addn_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(params[0], params[1]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template <typename T>
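Every kernel touched by this commit follows the same recipe, first visible here in addn.cc: compute the global work size gws, query the kernel's maximum work-group size kwg_size, enumerate candidate local work sizes in a params_generator lambda, wrap the actual enqueueNDRangeKernel call in a func lambda parameterized by one candidate, and hand all of it to Tuner<uint32_t>::Get()->TuneOrRun<cl_int>() under a shape-specific cache key. The Tuner implementation itself is not part of this diff; what follows is a minimal standalone sketch of the tune-or-run idea, where TuneOrRunSketch and its signature are names invented for this note, not MACE API.

#include <chrono>
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>

// Run `func` once per candidate local work size, timing each run and
// remembering the fastest candidate; this is the core of what a
// tune-or-run helper automates (MACE's real Tuner also persists results).
template <typename RetType>
RetType TuneOrRunSketch(
    const std::vector<std::vector<uint32_t>> &candidates,
    const std::function<RetType(const std::vector<uint32_t> &)> &func,
    std::vector<uint32_t> *best_params) {
  double best_seconds = std::numeric_limits<double>::max();
  RetType last_ret{};
  for (const auto &params : candidates) {
    auto start = std::chrono::steady_clock::now();
    last_ret = func(params);  // e.g. enqueue the kernel with this local size
    double elapsed = std::chrono::duration<double>(
        std::chrono::steady_clock::now() - start).count();
    if (elapsed < best_seconds) {
      best_seconds = elapsed;
      *best_params = params;
    }
  }
  return last_ret;
}

With the lambdas from the diff above, a call would look like TuneOrRunSketch<cl_int>(params_generator(), func, &best): each candidate is timed once and the fastest local work size wins, which is exactly the decision the old hardcoded cl::NDRange(64, 16) and its "// TODO fix this" comment used to sidestep.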
mace/kernels/opencl/batch_norm_opencl.cc

@@ -48,8 +48,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{8, 128, 1},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
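All of the 3-D kernels build their first derived candidate the same way: fill each dimension greedily while keeping the product within the device limit. As a worked example with assumed numbers (not taken from the diff): if kwg_size = 512, channel_blocks = 16, width = 64, and height * batch = 128, then local_ws[0] = min(16, 512) = 16, local_ws[1] = min(64, 512 / 16) = 32, and local_ws[2] = min(128, 512 / (16 * 32)) = 1, giving {16, 32, 1} whose product is exactly the maximum work-group size.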
mace/kernels/opencl/concat.cc

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -41,21 +42,57 @@ static void Concat2(const Tensor *input0,
   concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
   concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  uint32_t lws[3] = {8, 16, 8};
-//  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-//  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
-//  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      concat_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blk),
-                  static_cast<uint32_t>(width),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk),
+      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(batch * height),
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
+  std::vector<uint32_t> lws = {8, 16, 8};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        concat_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template <typename T>
mace/kernels/opencl/conv_2d_opencl_1x1.cc

@@ -68,8 +68,13 @@ void Conv1x1(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/conv_2d_opencl_3x3.cc

@@ -60,8 +60,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {4, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/conv_2d_opencl_general.cc

@@ -62,8 +62,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/pooling_opencl.cc

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -23,11 +24,6 @@ static void Pooling(const Tensor *input,
   index_t channels = output->dim(3);
   index_t channel_blocks = (channels + 3) / 4;
 
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blocks),
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(batch * out_height),
-  };
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
@@ -44,13 +40,6 @@ static void Pooling(const Tensor *input,
   }
   auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  uint32_t lws[3];
-  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-
   uint32_t idx = 0;
   pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
@@ -62,12 +51,60 @@ static void Pooling(const Tensor *input,
   pooling_kernel.setArg(idx++, pooling_size);
   pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      pooling_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << error;
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blocks),
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(batch * out_height),
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
+  std::vector<uint32_t> lws(3, 0);
+  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        pooling_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template <typename T>
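Pooling differs slightly from the other kernels: the gws/lws computation that used to precede the setArg calls now sits next to the tuner call, and the default lws is derived from kwg_size rather than hardcoded. Judging from the call sites in this diff (the Tuner API itself is not shown), the lws argument to TuneOrRun appears to serve as the fallback local size when tuning is not active, while the shape-keyed string ("pooling_opencl_kernel_<N>_<H>_<W>_<C>") lets tuned parameters be reused across runs with the same output shape.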
mace/kernels/opencl/relu_opencl.cc

@@ -50,8 +50,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/resize_bilinear_opencl.cc

@@ -7,6 +7,7 @@
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -44,8 +45,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
-
   uint32_t idx = 0;
   rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
@@ -55,17 +54,52 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
   rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
 
-  auto command_queue = runtime->command_queue();
-  cl_int error = command_queue.enqueueNDRangeKernel(
-      rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<int32_t>(channel_blocks),
-                  static_cast<int32_t>(out_width),
-                  static_cast<int32_t>(out_height * batch)),
-      // TODO tuning
-      cl::NDRange(1,
-                  static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width),
-                  1),
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        rb_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
     << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
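This kernel previously picked its local size with a one-off heuristic, cl::NDRange(1, min(out_width, kwg_size), 1), flagged with "// TODO tuning"; the commit resolves that TODO by switching it to the same candidate list and TuneOrRun call as the other kernels.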
mace/python/tools/BUILD

@@ -8,6 +8,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":memory_optimizer",
         "//mace/proto:mace_py",
     ],
 )
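This one-line dependency addition is what lets the converter library import memory_optimizer as a module; see the next file, where the script's command-line entry point is replaced by a callable function.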
mace/python/tools/memory_optimizer.py

@@ -65,7 +65,7 @@ class MemoryOptimizer(object):
         raise Exception('ref count is less than 0')
 
     for mem in self.mem_block:
-      arena = net_def.mem_arena
+      arena = self.net_def.mem_arena
       block = arena.mem_block.add()
      block.mem_id = mem
       block.x = self.mem_block[mem][0]
@@ -83,20 +83,7 @@ class MemoryOptimizer(object):
     print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
 
-if __name__ == '__main__':
-  model_file = sys.argv[1]
-  opt_model_file = sys.argv[2]
-  with open(model_file, "rb") as f:
-    net_def = mace_pb2.NetDef()
-    net_def.ParseFromString(f.read())
-    optimizer = MemoryOptimizer(net_def)
-    optimizer.optimize()
-  with open(opt_model_file, "wb") as f:
-    f.write(net_def.SerializeToString())
-  with open(opt_model_file + '_txt', "wb") as f:
-    net_def.ClearField('tensors')
-    f.write(str(net_def))
+def optimize_memory(net_def):
+  mem_optimizer = MemoryOptimizer(net_def)
+  mem_optimizer.optimize()
\ No newline at end of file
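The __main__ block, which ran the optimizer as a separate CLI step over serialized model files, is replaced by an in-process entry point: the converter can now call optimize_memory(net_def) on the NetDef it already holds. Consistent with this, validate_gcn.sh below drops its standalone bazel-bin/.../memory_optimizer invocation and the mace_opt_model.pb intermediate file.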
mace/python/tools/tf_converter_lib.py (+379, -268)

This diff is collapsed in the page view ("click to expand"); its contents were not captured.
mace/python/tools/tf_dsp_converter_lib.py

@@ -149,6 +149,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
   elif is_node_flatten_reshape(first_op):
     op_def.type = 'Flatten'
+    op_def.input.extend([t.name for t in first_op.inputs])
     op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
     convert_op_outputs(op_def, first_op)
   elif dsp_ops.has_op(first_op.type):
     op_def.input.extend([t.name for t in first_op.inputs])
tools/validate.py

@@ -4,6 +4,7 @@ import os
 import os.path
 import tensorflow as tf
 import numpy as np
+from scipy import spatial
 from tensorflow import gfile
@@ -34,9 +35,12 @@ def load_data(file):
 def valid_output(out_shape, mace_out_file, tf_out_value):
   mace_out_value = load_data(mace_out_file)
   if mace_out_value.size != 0:
-    mace_out_value = mace_out_value.reshape(out_shape)
-    np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
-    print '=======================Passed! Haha======================'
+    similarity = (1 - spatial.distance.cosine(tf_out_value.flat, mace_out_value))
+    print 'MACE VS TF similarity: ', similarity
+    if similarity > 0.999:
+      print '=======================Passed! Haha======================'
   else:
     print '=======================Skip empty node==================='
@@ -62,7 +66,7 @@ def run_model(input_shape):
     input_value = input_value.reshape(input_shape)
     output_value = session.run(output_node, feed_dict={input_node: [input_value]})
-    # output_value.astype(np.float32).tofile(os.path.dirname(FLAGS.input_file) + '/tf_weight')
+    output_value.astype(np.float32).tofile(os.path.dirname(FLAGS.input_file) + '/tf_out')
     return output_value
 
 def main(unused_args):
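Validation switches from an element-wise check (reshape plus np.testing.assert_allclose with rtol=0.05) to a single scalar score: cosine similarity, computed as 1 minus SciPy's cosine distance between the flattened TensorFlow and MACE outputs, passing above 0.999. One plausible motivation, not stated in the diff, is that a half-precision (DT_HALF) GPU run can trip a per-element tolerance on individual values while still being globally correct.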
tools/validate_gcn.sh

@@ -2,10 +2,10 @@
 # Must run at root dir of mace project.
 set +x
 Usage() {
-  echo 'Usage: bash tools/validate_gcn.sh tf_model_file'
+  echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size'
 }
 
-if [ $# != 1 ]; then
+if [ $# != 2 ]; then
   Usage
   exit -1
 fi
@@ -13,18 +13,18 @@ fi
 TF_MODEL_FILE_PATH=$1
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_MODEL_NAME='mace_model.pb'
-MACE_OPT_MODEL_NAME='mace_opt_model.pb'
 INPUT_FILE_NAME='model_input'
 OUTPUT_FILE_NAME='gcn.out'
 OUTPUT_LIST_FILE='gcn.list'
 PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
 KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
+IMAGE_SIZE=$2
 
 # Step 1: Generate input data
 echo "Step 1: Generate input data"
 python tools/validate.py --generate_data true --random_seed 1 \
   --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
-  --input_shape=512,512,3
+  --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"
 
 # Step 2: convert tf model to mace model
 echo "Step 2: convert tf model to mace model and optimize memory"
@@ -35,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
   --output_node=GCN/br_result_2/fcn_br \
   --data_type=DT_HALF \
   --runtime=gpu
-
-bazel build mace/python/tools:memory_optimizer
-bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
-  ${MODEL_DIR}/${MACE_OPT_MODEL_NAME}
 
 # Step 3: Run model on the phone
 echo "Step 3: Run model on the phone"
@@ -49,21 +45,22 @@ bazel build -c opt --strip always mace/examples:mace_run \
 adb shell "mkdir -p ${PHONE_DATA_DIR}"
 adb shell "mkdir -p ${KERNEL_DIR}"
-adb push mace/kernels/opencl/cl/ ${KERNEL_DIR}
-adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR}
+adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
+adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
 
+num_threads=${1:-4}
+
-adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
+adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \
+  MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
   MACE_KERNEL_PATH=$KERNEL_DIR \
+  OMP_NUM_THREADS=$num_threads \
   ${PHONE_DATA_DIR}/mace_run \
-  --model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \
+  --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
   --input=mace_input_node \
   --output=mace_output_node \
-  --input_shape=1,512,512,3 \
+  --input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3" \
   --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
   --output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
   --device=OPENCL \
@@ -81,4 +78,5 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
   --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
   --input_node input \
   --output_node GCN/br_result_2/fcn_br \
-  --output_shape 1,512,512,2
+  --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
+  --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"