Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
a9dce8ec
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
a9dce8ec
编写于
1月 24, 2018
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add block tuning to limit the execution time less than 1ms.
上级
537b4600
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
724 addition
and
319 deletion
+724
-319
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+15
-0
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+16
-12
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+58
-25
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+53
-20
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+58
-25
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+59
-28
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+58
-25
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+58
-26
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+58
-26
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+2
-0
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+59
-27
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+59
-23
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+60
-28
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+60
-22
mace/utils/timer.h
mace/utils/timer.h
+41
-20
mace/utils/tuner.h
mace/utils/tuner.h
+10
-12
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
a9dce8ec
...
@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
...
@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return
(
stop_nanos_
-
start_nanos_
)
/
1000.0
;
return
(
stop_nanos_
-
start_nanos_
)
/
1000.0
;
}
}
double
OpenCLProfilingTimer
::
AccumulatedMicros
()
{
return
accumulated_micros_
;
}
void
OpenCLProfilingTimer
::
AccumulateTiming
(){
StopTiming
();
accumulated_micros_
+=
(
stop_nanos_
-
start_nanos_
)
/
1000.0
;
}
void
OpenCLProfilingTimer
::
ClearTiming
()
{
start_nanos_
=
0
;
stop_nanos_
=
0
;
accumulated_micros_
=
0
;
}
OpenCLRuntime
*
OpenCLRuntime
::
Global
()
{
OpenCLRuntime
*
OpenCLRuntime
::
Global
()
{
static
OpenCLRuntime
instance
;
static
OpenCLRuntime
instance
;
return
&
instance
;
return
&
instance
;
...
...
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
a9dce8ec
...
@@ -18,16 +18,20 @@
...
@@ -18,16 +18,20 @@
namespace
mace
{
namespace
mace
{
class
OpenCLProfilingTimer
:
public
Timer
{
class
OpenCLProfilingTimer
:
public
Timer
{
public:
public:
explicit
OpenCLProfilingTimer
(
const
cl
::
Event
*
event
)
:
event_
(
event
)
{};
explicit
OpenCLProfilingTimer
(
const
cl
::
Event
*
event
)
:
event_
(
event
),
accumulated_micros_
(
0
)
{};
void
StartTiming
()
override
;
void
StartTiming
()
override
;
void
StopTiming
()
override
;
void
StopTiming
()
override
;
double
ElapsedMicros
()
override
;
void
AccumulateTiming
()
override
;
void
ClearTiming
()
override
;
double
ElapsedMicros
()
override
;
double
AccumulatedMicros
()
override
;
private:
private:
const
cl
::
Event
*
event_
;
const
cl
::
Event
*
event_
;
double
start_nanos_
;
double
start_nanos_
;
double
stop_nanos_
;
double
stop_nanos_
;
double
accumulated_micros_
;
};
};
class
OpenCLRuntime
{
class
OpenCLRuntime
{
...
@@ -40,15 +44,15 @@ class OpenCLRuntime {
...
@@ -40,15 +44,15 @@ class OpenCLRuntime {
void
GetCallStats
(
const
cl
::
Event
&
event
,
CallStats
*
stats
);
void
GetCallStats
(
const
cl
::
Event
&
event
,
CallStats
*
stats
);
uint32_t
GetDeviceMaxWorkGroupSize
();
uint32_t
GetDeviceMaxWorkGroupSize
();
uint32_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
uint32_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
const
std
::
set
<
std
::
string
>
&
build_options
);
private:
private:
OpenCLRuntime
();
OpenCLRuntime
();
~
OpenCLRuntime
();
~
OpenCLRuntime
();
OpenCLRuntime
(
const
OpenCLRuntime
&
)
=
delete
;
OpenCLRuntime
(
const
OpenCLRuntime
&
)
=
delete
;
OpenCLRuntime
&
operator
=
(
const
OpenCLRuntime
&
)
=
delete
;
OpenCLRuntime
&
operator
=
(
const
OpenCLRuntime
&
)
=
delete
;
void
BuildProgram
(
const
std
::
string
&
program_file_name
,
void
BuildProgram
(
const
std
::
string
&
program_file_name
,
const
std
::
string
&
binary_file_name
,
const
std
::
string
&
binary_file_name
,
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
a9dce8ec
...
@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
activation_kernel
);
runtime
->
GetKernelMaxWorkGroupSize
(
activation_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
...
@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
return
{
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
3
,
15
,
9
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
},
// SNPE size
{
4
,
15
,
8
,
1
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
activation_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
uint32_t
num_blocks
=
params
.
back
();
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
activation_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
activation_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
activation_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
a9dce8ec
...
@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
...
@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
static_cast
<
uint32_t
>
(
batch_height_pixels
)
static_cast
<
uint32_t
>
(
batch_height_pixels
)
};
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
addn_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
addn_kernel
);
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
};
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
uint32_t
local_ws
[
2
];
uint32_t
local_ws
[
2
];
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
width_pixels
,
kwg_size
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
width_pixels
,
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
batch_height_pixels
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
batch_height_pixels
,
kwg_size
/
local_ws
[
0
]);
return
{{
local_ws
[
0
],
local_ws
[
1
]},
return
{{
local_ws
[
0
],
local_ws
[
1
]
,
1
},
{
local_ws
[
1
],
local_ws
[
0
]},
{
local_ws
[
1
],
local_ws
[
0
]
,
1
},
{
kwg_size
/
4
,
4
},
{
kwg_size
/
4
,
4
,
1
},
{
kwg_size
/
16
,
16
},
{
kwg_size
/
16
,
16
,
1
},
{
kwg_size
/
32
,
32
},
{
kwg_size
/
32
,
32
,
1
},
{
kwg_size
/
64
,
64
},
{
kwg_size
/
64
,
64
,
1
},
{
kwg_size
/
128
,
128
},
{
kwg_size
/
128
,
128
,
1
},
{
kwg_size
/
256
,
256
},
{
kwg_size
/
256
,
256
,
1
},
{
kwg_size
/
512
,
512
},
{
kwg_size
/
512
,
512
,
1
},
{
kwg_size
,
1
},
{
kwg_size
,
1
,
1
},
{
1
,
kwg_size
}
{
1
,
kwg_size
,
1
}
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
addn_kernel
,
cl
::
NullRange
,
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
uint32_t
num_blocks
=
params
.
back
();
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
nullptr
,
&
event
);
if
(
gws
[
1
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
addn_kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
addn_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
1
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
if
(
gws
[
1
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
addn_kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
a9dce8ec
...
@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
bm_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
bm_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
...
@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
return
{
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
3
,
15
,
9
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
8
,
128
,
1
},
// SNPE size
{
8
,
128
,
1
,
1
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
bm_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
uint32_t
num_blocks
=
params
.
back
();
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
bm_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
bm_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
bm_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
a9dce8ec
...
@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0,
...
@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0,
static_cast
<
uint32_t
>
(
batch
*
height
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
concat_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
concat_kernel
);
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blk
,
kwg_size
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blk
,
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
return
{{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]
,
1
},
{
local_ws
[
2
],
local_ws
[
1
],
local_ws
[
0
]},
{
local_ws
[
2
],
local_ws
[
1
],
local_ws
[
0
]
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
3
,
15
,
9
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
},
//SNPE size
{
4
,
15
,
8
,
1
},
//SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
concat_kernel
,
cl
::
NullRange
,
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
uint32_t
num_blocks
=
params
.
back
();
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
nullptr
,
&
event
);
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
concat_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
concat_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
concat_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
a9dce8ec
...
@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
...
@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
15
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
15
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
...
@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input,
...
@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
return
{
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
3
,
15
,
9
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
},
// SNPE size
{
4
,
15
,
8
,
1
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
uint32_t
num_blocks
=
params
.
back
();
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
a9dce8ec
...
@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
...
@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
15
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
4
,
15
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
...
@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input,
...
@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
return
{
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
local_ws
[
2
],
local_ws
[
1
],
local_ws
[
0
]},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
,
1
},
{
3
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
9
,
7
,
15
,
1
},
{
9
,
7
,
15
},
{
15
,
7
,
9
,
1
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
},
{
4
,
15
,
8
,
1
},
// SNPE size
{
4
,
15
,
8
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
uint32_t
num_blocks
=
params
.
back
();
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
a9dce8ec
...
@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input,
...
@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input,
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
...
@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input,
...
@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
return
{
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
local_ws
[
2
],
local_ws
[
1
],
local_ws
[
0
]},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
,
1
},
{
3
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
9
,
7
,
15
,
1
},
{
9
,
7
,
15
},
{
15
,
7
,
9
,
1
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
},
{
4
,
15
,
8
,
1
},
// SNPE size
{
4
,
15
,
8
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
uint32_t
num_blocks
=
params
.
back
();
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
...
...
mace/kernels/opencl/helper.h
浏览文件 @
a9dce8ec
...
@@ -14,6 +14,8 @@
...
@@ -14,6 +14,8 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
const
float
kMaxKernelExeTime
=
1000.0
;
// microseconds
enum
BufferType
{
enum
BufferType
{
FILTER
=
0
,
FILTER
=
0
,
IN_OUT
=
1
,
IN_OUT
=
1
,
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
a9dce8ec
...
@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input,
...
@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input,
static_cast
<
uint32_t
>
(
batch
*
out_height
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
pooling_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
pooling_kernel
);
std
::
vector
<
uint32_t
>
lws
(
3
,
0
);
std
::
vector
<
uint32_t
>
lws
(
4
,
1
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
out_width
,
kwg_size
/
lws
[
0
]);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
out_width
,
kwg_size
/
lws
[
0
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
out_height
*
batch
,
kwg_size
/
(
lws
[
0
]
*
lws
[
1
]));
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
out_height
*
batch
,
kwg_size
/
(
lws
[
0
]
*
lws
[
1
]));
...
@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input,
...
@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input,
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
out_width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
out_width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
out_height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
out_height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
return
{
{
kwg_size
/
16
,
4
,
4
},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
3
,
15
,
9
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
7
,
15
,
9
},
{
3
,
15
,
9
,
1
},
{
9
,
7
,
15
},
{
7
,
15
,
9
,
1
},
{
15
,
7
,
9
},
{
9
,
7
,
15
,
1
},
{
1
,
kwg_size
,
1
},
{
15
,
7
,
9
,
1
},
{
4
,
15
,
8
},
//SNPE size
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
,
1
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
pooling_kernel
,
cl
::
NullRange
,
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
uint32_t
num_blocks
=
params
.
back
();
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
nullptr
,
&
event
);
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
pooling_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
pooling_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
pooling_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
a9dce8ec
...
@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
rb_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
rb_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
out_width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
out_width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
out_height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
out_height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
return
{
{
kwg_size
/
16
,
4
,
4
},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
,
1
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
4
,
15
,
8
},
//SNPE size
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
,
1
},
// SNPE size
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
rb_kernel
,
cl
::
NullRange
,
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
uint32_t
num_blocks
=
params
.
back
();
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
nullptr
,
&
event
);
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
rb_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
rb_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
rb_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
a9dce8ec
...
@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
...
@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
softmax_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
softmax_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blocks
,
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
width
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{{
4
,
15
,
8
},
//SNPE size
return
{
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
local_ws
[
2
],
local_ws
[
1
],
local_ws
[
0
]},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
,
1
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
,
1
},
{
3
,
15
,
9
},
{
7
,
15
,
9
,
1
},
{
7
,
15
,
9
},
{
9
,
7
,
15
,
1
},
{
9
,
7
,
15
},
{
15
,
7
,
9
,
1
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
}};
{
4
,
15
,
8
,
1
},
// SNPE size
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
softmax_kernel
,
cl
::
NullRange
,
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
uint32_t
num_blocks
=
params
.
back
();
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
nullptr
,
&
event
);
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
softmax_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
softmax_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
softmax_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
a9dce8ec
...
@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
...
@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
s2b_kernel
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
s2b_kernel
);
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
chan_blk
,
kwg_size
);
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
chan_blk
,
kwg_size
);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
32
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
32
,
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
32
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
32
,
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
]},
return
{
{
4
,
32
,
8
},
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
4
,
64
,
4
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
4
,
128
,
2
},
{
kwg_size
/
32
,
4
,
8
,
1
},
{
8
,
16
,
8
},
{
kwg_size
/
32
,
8
,
4
,
1
},
{
8
,
32
,
4
},
{
kwg_size
/
64
,
8
,
8
,
1
},
{
8
,
64
,
2
},
{
kwg_size
/
64
,
16
,
4
,
1
},
{
16
,
8
,
8
},
{
kwg_size
/
128
,
8
,
16
,
1
},
{
16
,
16
,
4
},
{
kwg_size
/
128
,
16
,
8
,
1
},
{
16
,
32
,
2
},
{
kwg_size
/
128
,
32
,
4
,
1
},
{
32
,
8
,
4
},
{
1
,
kwg_size
/
32
,
32
,
1
},
{
32
,
16
,
2
},
{
1
,
kwg_size
/
64
,
64
,
1
},
{
64
,
4
,
4
}};
{
1
,
kwg_size
/
128
,
128
,
1
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
,
1
},
// SNPE size
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
=
CL_SUCCESS
;
s2b_kernel
,
cl
::
NullRange
,
if
(
timer
==
nullptr
)
{
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
uint32_t
num_blocks
=
params
.
back
();
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
nullptr
,
&
event
);
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
s2b_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
StartTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
s2b_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
StopTiming
();
double
elapse_time
=
timer
->
ElapsedMicros
();
timer
->
ClearTiming
();
uint32_t
num_blocks
=
std
::
min
(
static_cast
<
uint32_t
>
(
elapse_time
/
kMaxKernelExeTime
)
+
1
,
gws
[
2
]);
params
.
back
()
=
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
if
(
gws
[
2
]
%
num_blocks
>
0
)
num_blocks
++
;
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
s2b_kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
}
}
return
error
;
return
error
;
};
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
...
...
mace/utils/timer.h
浏览文件 @
a9dce8ec
...
@@ -10,29 +10,50 @@
...
@@ -10,29 +10,50 @@
namespace
mace
{
namespace
mace
{
class
Timer
{
class
Timer
{
public:
public:
virtual
void
StartTiming
()
=
0
;
virtual
void
StartTiming
()
=
0
;
virtual
void
StopTiming
()
=
0
;
virtual
void
StopTiming
()
=
0
;
virtual
double
ElapsedMicros
()
=
0
;
virtual
void
AccumulateTiming
()
=
0
;
virtual
void
ClearTiming
()
=
0
;
virtual
double
ElapsedMicros
()
=
0
;
virtual
double
AccumulatedMicros
()
=
0
;
};
};
class
WallClockTimer
:
public
Timer
{
class
WallClockTimer
:
public
Timer
{
public:
public:
void
StartTiming
()
override
{
WallClockTimer
()
:
accumulated_micros_
(
0
)
{}
start_micros_
=
mace
::
utils
::
NowMicros
();
}
void
StartTiming
()
override
{
start_micros_
=
mace
::
utils
::
NowMicros
();
void
StopTiming
()
override
{
}
stop_micros_
=
mace
::
utils
::
NowMicros
();
}
void
StopTiming
()
override
{
stop_micros_
=
mace
::
utils
::
NowMicros
();
double
ElapsedMicros
()
override
{
}
return
stop_micros_
-
start_micros_
;
}
void
AccumulateTiming
()
override
{
StopTiming
();
private:
accumulated_micros_
+=
stop_micros_
-
start_micros_
;
double
start_micros_
;
}
double
stop_micros_
;
void
ClearTiming
()
override
{
start_micros_
=
0
;
stop_micros_
=
0
;
accumulated_micros_
=
0
;
}
double
ElapsedMicros
()
override
{
return
stop_micros_
-
start_micros_
;
}
double
AccumulatedMicros
()
override
{
return
accumulated_micros_
;
}
private:
double
start_micros_
;
double
stop_micros_
;
double
accumulated_micros_
;
};
};
}
// namespace mace
}
// namespace mace
...
...
mace/utils/tuner.h
浏览文件 @
a9dce8ec
...
@@ -41,10 +41,10 @@ class Tuner {
...
@@ -41,10 +41,10 @@ class Tuner {
template
<
typename
RetType
>
template
<
typename
RetType
>
RetType
TuneOrRun
(
RetType
TuneOrRun
(
const
std
::
string
param_key
,
const
std
::
string
param_key
,
const
std
::
vector
<
param_type
>
&
default_param
,
std
::
vector
<
param_type
>
&
default_param
,
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
&
param_generator
,
&
param_generator
,
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
,
const
std
::
function
<
RetType
(
std
::
vector
<
param_type
>
&
,
Timer
*
)
>
&
func
,
Timer
*
timer
)
{
Timer
*
timer
)
{
std
::
string
obfucated_param_key
=
MACE_OBFUSCATE_SYMBOL
(
param_key
);
std
::
string
obfucated_param_key
=
MACE_OBFUSCATE_SYMBOL
(
param_key
);
if
(
IsTuning
()
&&
param_generator
!=
nullptr
)
{
if
(
IsTuning
()
&&
param_generator
!=
nullptr
)
{
...
@@ -60,12 +60,12 @@ class Tuner {
...
@@ -60,12 +60,12 @@ class Tuner {
if
(
param_table_
.
find
(
obfucated_param_key
)
!=
param_table_
.
end
())
{
if
(
param_table_
.
find
(
obfucated_param_key
)
!=
param_table_
.
end
())
{
VLOG
(
1
)
<<
param_key
<<
": "
VLOG
(
1
)
<<
param_key
<<
": "
<<
internal
::
MakeString
(
param_table_
[
obfucated_param_key
]);
<<
internal
::
MakeString
(
param_table_
[
obfucated_param_key
]);
return
func
(
param_table_
[
obfucated_param_key
]);
return
func
(
param_table_
[
obfucated_param_key
]
,
nullptr
);
}
else
{
}
else
{
#ifndef MACE_DISABLE_NO_TUNING_WARNING
#ifndef MACE_DISABLE_NO_TUNING_WARNING
LOG
(
WARNING
)
<<
"Fallback to default parameter: "
<<
param_key
;
LOG
(
WARNING
)
<<
"Fallback to default parameter: "
<<
param_key
;
#endif
#endif
return
func
(
default_param
);
return
func
(
default_param
,
nullptr
);
}
}
}
}
}
}
...
@@ -119,18 +119,16 @@ class Tuner {
...
@@ -119,18 +119,16 @@ class Tuner {
template
<
typename
RetType
>
template
<
typename
RetType
>
inline
RetType
Run
(
inline
RetType
Run
(
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
,
const
std
::
function
<
RetType
(
std
::
vector
<
param_type
>
&
,
Timer
*
)
>
&
func
,
const
std
::
vector
<
param_type
>
&
params
,
std
::
vector
<
param_type
>
&
params
,
Timer
*
timer
,
Timer
*
timer
,
int
num_runs
,
int
num_runs
,
double
*
time_us
)
{
double
*
time_us
)
{
RetType
res
;
RetType
res
;
int64_t
total_time_us
=
0
;
int64_t
total_time_us
=
0
;
for
(
int
i
=
0
;
i
<
num_runs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_runs
;
++
i
)
{
timer
->
StartTiming
();
res
=
func
(
params
,
timer
);
res
=
func
(
params
);
total_time_us
+=
timer
->
AccumulatedMicros
();
timer
->
StopTiming
();
total_time_us
+=
timer
->
ElapsedMicros
();
}
}
*
time_us
=
total_time_us
*
1.0
/
num_runs
;
*
time_us
=
total_time_us
*
1.0
/
num_runs
;
...
@@ -141,13 +139,13 @@ class Tuner {
...
@@ -141,13 +139,13 @@ class Tuner {
inline
RetType
Tune
(
inline
RetType
Tune
(
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
&
param_generator
,
&
param_generator
,
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
,
const
std
::
function
<
RetType
(
std
::
vector
<
param_type
>
&
,
Timer
*
)
>
&
func
,
Timer
*
timer
,
Timer
*
timer
,
std
::
vector
<
param_type
>
*
opt_params
)
{
std
::
vector
<
param_type
>
*
opt_params
)
{
RetType
res
;
RetType
res
;
double
opt_time
=
std
::
numeric_limits
<
double
>::
max
();
double
opt_time
=
std
::
numeric_limits
<
double
>::
max
();
auto
params
=
param_generator
();
auto
params
=
param_generator
();
for
(
const
auto
&
param
:
params
)
{
for
(
auto
param
:
params
)
{
double
tmp_time
=
0.0
;
double
tmp_time
=
0.0
;
// warm up
// warm up
Run
<
RetType
>
(
func
,
param
,
timer
,
2
,
&
tmp_time
);
Run
<
RetType
>
(
func
,
param
,
timer
,
2
,
&
tmp_time
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录