Commit faa8459b
Author: liuqi
Date: Jan 24, 2018
Parent: a9dce8ec

    Refactor tuning code.

Showing 13 changed files, with 205 additions and 866 deletions (+205 −866).
mace/kernels/opencl/activation_opencl.cc        +1    −75
mace/kernels/opencl/addn.cc                     +1    −76
mace/kernels/opencl/batch_norm_opencl.cc        +1    −74
mace/kernels/opencl/concat.cc                   +1    −83
mace/kernels/opencl/conv_2d_opencl_1x1.cc       +1    −74
mace/kernels/opencl/conv_2d_opencl_3x3.cc       +1    −74
mace/kernels/opencl/conv_2d_opencl_general.cc   +1    −74
mace/kernels/opencl/helper.cc                   +177  −0
mace/kernels/opencl/helper.h                    +17   −4
mace/kernels/opencl/pooling_opencl.cc           +1    −83
mace/kernels/opencl/resize_bilinear_opencl.cc   +1    −83
mace/kernels/opencl/softmax_opencl.cc           +1    −83
mace/kernels/opencl/space_to_batch_opencl.cc    +1    −83
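
Most of the 866 deletions are the same inline OpenCL work-group tuning block, copied into every kernel file above; the 205 additions replace those copies with two shared helpers, TuningOrRun3DKernel and TuningOrRun2DKernel, in mace/kernels/opencl/helper.cc. As a minimal sketch of the resulting call-site pattern (the kernel and dimension names below are hypothetical stand-ins, not taken from the diff):

    // Hedged sketch of a post-refactor call site. `my_kernel`, `channel_blocks`,
    // `width`, `height`, `batch`, `output`, and `future` are hypothetical.
    const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                             static_cast<uint32_t>(width),
                             static_cast<uint32_t>(height * batch)};
    // Default local work size; the trailing 1 is the num_blocks slot the tuner rewrites.
    std::vector<uint32_t> lws = {8, 16, 8, 1};
    std::string tuning_key =
        Concat("my_opencl_kernel_", output->dim(0), output->dim(1),
               output->dim(2), output->dim(3));
    TuningOrRun3DKernel(my_kernel, tuning_key, gws, lws, future);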
mace/kernels/opencl/activation_opencl.cc

@@ -64,84 +64,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(activation_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            {kwg_size / 16, 4, 4, 1},    {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 32, 8, 4, 1},    {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 64, 16, 4, 1},   {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 16, 8, 1},  {kwg_size / 128, 32, 4, 1},
-            {1, kwg_size / 32, 32, 1},   {1, kwg_size / 64, 64, 1},
-            {1, kwg_size / 128, 128, 1},
-            {3, 15, 9, 1}, {7, 15, 9, 1}, {9, 7, 15, 1}, {15, 7, 9, 1},
-            {1, kwg_size, 1, 1},
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            activation_kernel, cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            activation_kernel, cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("relu_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future);
 }

 template struct ActivationFunctor<DeviceType::OPENCL, float>;
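
The removed `func` lambda splits the third global-work-size dimension into blocks so that no single enqueue runs much past kMaxKernelExeTime (1000 microseconds, per helper.h). A self-contained sketch of that splitting arithmetic, using hypothetical profiled numbers rather than anything measured in the diff:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Standalone illustration of the block-splitting arithmetic in `func` above,
    // assuming kMaxKernelExeTime = 1000.0 microseconds as defined in helper.h.
    int main() {
      const float kMaxKernelExeTime = 1000.0f;
      double elapse_time = 3500.0;  // hypothetical profiled time, microseconds
      uint32_t gws2_total = 64;     // hypothetical gws[2]

      // One full-range enqueue took 3.5 ms, so split the third dimension into
      // min(3500/1000 + 1, 64) = 4 blocks of 64/4 = 16 work-items each.
      uint32_t num_blocks = std::min(
          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws2_total);
      const uint32_t block_size = gws2_total / num_blocks;
      if (gws2_total % num_blocks > 0) num_blocks++;  // extra block for remainder

      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws2 = (i == num_blocks - 1) ? (gws2_total - i * block_size)
                                              : block_size;
        std::printf("block %u: offset %u, size %u\n", i, i * block_size, gws2);
      }
      return 0;
    }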
mace/kernels/opencl/addn.cc

@@ -49,89 +49,14 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
                            static_cast<uint32_t>(width_pixels),
                            static_cast<uint32_t>(batch_height_pixels)};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
   std::vector<uint32_t> lws = {64, 16, 1};
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    uint32_t local_ws[2];
-    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
-    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1], 1},
-            {local_ws[1], local_ws[0], 1},
-            {kwg_size / 4, 4, 1},     {kwg_size / 16, 16, 1},
-            {kwg_size / 32, 32, 1},   {kwg_size / 64, 64, 1},
-            {kwg_size / 128, 128, 1}, {kwg_size / 256, 256, 1},
-            {kwg_size / 512, 512, 1},
-            {kwg_size, 1, 1},
-            {1, kwg_size, 1}};
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            addn_kernel, cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          addn_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
-          cl::NDRange(params[0], params[1]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            addn_kernel, cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      ss.str(), lws, params_generator, func, &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);
 }

 template <typename T>
mace/kernels/opencl/batch_norm_opencl.cc

@@ -84,83 +84,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               except the final entry: */
-            {8, 128, 1, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing bm_kernel */
-  };
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future);
 }

 template struct BatchNormFunctor<DeviceType::OPENCL, float>;
mace/kernels/opencl/concat.cc

@@ -50,96 +50,14 @@ static void Concat2(const Tensor *input0,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(batch * height),
   };
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            {local_ws[2], local_ws[1], local_ws[0], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  //SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing concat_kernel */
-  };
   std::stringstream ss;
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      ss.str(), lws, params_generator, func, &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future);
 }

 template <typename T>
mace/kernels/opencl/conv_2d_opencl_1x1.cc

@@ -97,83 +97,10 @@ void Conv1x1(const Tensor *input,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 15, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing conv_2d_kernel */
-  };
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }

 extern void Conv2dOpenclK1x1S1(const Tensor *input,
mace/kernels/opencl/conv_2d_opencl_3x3.cc

@@ -95,83 +95,10 @@ static void Conv2d3x3S12(const Tensor *input,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {4, 15, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing conv_2d_kernel */
-  };
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }

 void Conv2dOpenclK3x3S1(const Tensor *input,
                         const Tensor *filter,
mace/kernels/opencl/conv_2d_opencl_general.cc

@@ -97,83 +97,10 @@ void Conv2dOpencl(const Tensor *input,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing conv_2d_kernel */
-  };
   std::string tuning_key =
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }

 }  // namespace kernels
mace/kernels/opencl/helper.cc

@@ -4,6 +4,7 @@
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -100,5 +101,181 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }

+void TuningOrRun3DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
+    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
+    local_ws[2] =
+        std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
+    return {{local_ws[0], local_ws[1], local_ws[2], 1},
+            {kwg_size / 16, 4, 4, 1},    {kwg_size / 32, 4, 8, 1},
+            {kwg_size / 32, 8, 4, 1},    {kwg_size / 64, 8, 8, 1},
+            {kwg_size / 64, 16, 4, 1},   {kwg_size / 128, 8, 16, 1},
+            {kwg_size / 128, 16, 8, 1},  {kwg_size / 128, 32, 4, 1},
+            {1, kwg_size / 32, 32, 1},   {1, kwg_size / 64, 64, 1},
+            {1, kwg_size / 128, 128, 1},
+            {3, 15, 9, 1}, {7, 15, 9, 1}, {9, 7, 15, 1}, {15, 7, 9, 1},
+            {1, kwg_size, 1, 1},
+            {4, 15, 8, 1},  // SNPE size
+    };
+  };
+  cl::Event event;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
+    return error;
+  };
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  if (future != nullptr) {
+    future->wait_fn = [event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        OpenCLRuntime::Global()->GetCallStats(event, stats);
+      }
+    };
+  }
+}
+
+void TuningOrRun2DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
+    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1], 1},
+            {local_ws[1], local_ws[0], 1},
+            {kwg_size / 4, 4, 1},     {kwg_size / 16, 16, 1},
+            {kwg_size / 32, 32, 1},   {kwg_size / 64, 64, 1},
+            {kwg_size / 128, 128, 1}, {kwg_size / 256, 256, 1},
+            {kwg_size / 512, 512, 1},
+            {kwg_size, 1, 1},
+            {1, kwg_size, 1}};
+  };
+  cl::Event event;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
+    return error;
+  };
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }
+}
+
 }  // namespace kernels
 }  // namespace mace
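
The 2D variant is called the same way; a minimal sketch mirroring the addn.cc change above (variable names as in that file, treated here as assumptions):

    const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                             static_cast<uint32_t>(batch_height_pixels)};
    std::vector<uint32_t> lws = {64, 16, 1};  // last entry is the num_blocks slot
    std::stringstream ss;
    ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
       << "_" << output->dim(2) << "_" << output->dim(3);
    TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);

Note the design visible in both helpers: the profiling branch (timer != nullptr) measures one full-range enqueue, derives num_blocks from the elapsed time, and stores it in params.back(); the replay branch (timer == nullptr) reads that cached value back, so the tuned parameters fully determine how the kernel is split on later runs.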
mace/kernels/opencl/helper.h

@@ -18,7 +18,7 @@ const float kMaxKernelExeTime = 1000.0; // microseconds

 enum BufferType {
   FILTER = 0,
   IN_OUT = 1,
   ARGUMENT = 2
 };
@@ -34,6 +34,19 @@ std::string DtToCLDt(const DataType dt);

 std::string DtToUpstreamCLDt(const DataType dt);

+void TuningOrRun3DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future);
+
+void TuningOrRun2DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future);
+
 inline void SetFuture(StatsFuture *future, const cl::Event &event) {
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
...
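
The diff context cuts SetFuture off here. A plausible completion, hedged: it likely matches the identical inline block added in helper.cc's TuningOrRun3DKernel above, capturing the event by value so the callback stays valid after the caller returns:

    // Hedged sketch of the truncated body; inferred from helper.cc, not shown
    // in this diff.
    inline void SetFuture(StatsFuture *future, const cl::Event &event) {
      if (future != nullptr) {
        future->wait_fn = [event](CallStats *stats) {
          event.wait();
          if (stats != nullptr) {
            OpenCLRuntime::Global()->GetCallStats(event, stats);
          }
        };
      }
    }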
mace/kernels/opencl/pooling_opencl.cc

@@ -64,95 +64,13 @@ static void Pooling(const Tensor *input,
   lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
   lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
   lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(out_height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing pooling_kernel */
-  };
   std::stringstream ss;
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      ss.str(), lws, params_generator, func, &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future);
 }

 template <typename T>
mace/kernels/opencl/resize_bilinear_opencl.cc

@@ -60,95 +60,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(out_height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing rb_kernel */
-  };
   std::stringstream ss;
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      ss.str(), lws, params_generator, func, &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future);
 }

 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
mace/kernels/opencl/softmax_opencl.cc

@@ -42,95 +42,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing softmax_kernel */
-  };
   std::stringstream ss;
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      ss.str(), lws, params_generator, func, &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future);
 }

 template
mace/kernels/opencl/space_to_batch_opencl.cc

@@ -62,95 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
                            static_cast<uint32_t>(batch_tensor->dim(2)),
                            static_cast<uint32_t>(batch_tensor->dim(0) *
                                                  batch_tensor->dim(1))};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
-    local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            /* remaining candidate rows identical to activation_opencl.cc,
-               ending with */
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    /* same run/tune enqueue-splitting lambda as removed from
-       activation_opencl.cc, enqueueing s2b_kernel */
-  };
   std::stringstream ss;
   ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
      << batch_tensor->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      ss.str(), lws, params_generator, func, &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future);
 }

 template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;