Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
野马c
Mace
提交
e4455322
Mace
项目概览
野马c
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
e4455322
编写于
11月 02, 2017
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add opencl kernel with image example
上级
546926a1
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
258 addition
and
63 deletion
+258
-63
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+4
-2
mace/core/runtime/opencl/opencl_wrapper.cc
mace/core/runtime/opencl/opencl_wrapper.cc
+34
-4
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+2
-0
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+94
-29
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+124
-28
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
e4455322
...
...
@@ -26,6 +26,7 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
std
::
string
line
;
while
(
std
::
getline
(
ifs
,
line
))
{
*
content
+=
line
;
*
content
+=
"
\n
"
;
}
ifs
.
close
();
return
true
;
...
...
@@ -66,14 +67,15 @@ bool BuildProgram(OpenCLRuntime *runtime,
*
program
=
cl
::
Program
(
runtime
->
context
(),
sources
);
std
::
string
build_options
=
"-Werror -cl-mad-enable -cl-fast-relaxed-math -I"
+
path
;
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
if
(
program
->
build
({
runtime
->
device
()},
build_options
.
c_str
())
!=
CL_SUCCESS
)
{
cl_int
ret
=
program
->
build
({
runtime
->
device
()},
build_options
.
c_str
());
if
(
ret
!=
CL_SUCCESS
)
{
if
(
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_STATUS
>
(
runtime
->
device
())
==
CL_BUILD_ERROR
)
{
std
::
string
build_log
=
program
->
getBuildInfo
<
CL_PROGRAM_BUILD_LOG
>
(
runtime
->
device
());
LOG
(
INFO
)
<<
"Program build log: "
<<
build_log
;
}
LOG
(
FATAL
)
<<
"Build program failed
"
;
LOG
(
FATAL
)
<<
"Build program failed
: "
<<
ret
;
}
return
true
;
...
...
mace/core/runtime/opencl/opencl_wrapper.cc
浏览文件 @
e4455322
...
...
@@ -126,6 +126,12 @@ class OpenCLLibraryImpl final {
using
clRetainKernelFunc
=
cl_int
(
*
)(
cl_kernel
kernel
);
using
clCreateBufferFunc
=
cl_mem
(
*
)(
cl_context
,
cl_mem_flags
,
size_t
,
void
*
,
cl_int
*
);
using
clCreateImageFunc
=
cl_mem
(
*
)(
cl_context
,
cl_mem_flags
,
const
cl_image_format
*
,
const
cl_image_desc
*
,
void
*
,
cl_int
*
);
using
clCreateProgramWithSourceFunc
=
cl_program
(
*
)(
cl_context
,
cl_uint
,
const
char
**
,
const
size_t
*
,
cl_int
*
);
using
clReleaseKernelFunc
=
cl_int
(
*
)(
cl_kernel
kernel
);
...
...
@@ -136,8 +142,12 @@ class OpenCLLibraryImpl final {
using
clRetainDeviceFunc
=
cl_int
(
*
)(
cl_device_id
);
using
clReleaseDeviceFunc
=
cl_int
(
*
)(
cl_device_id
);
using
clRetainEventFunc
=
cl_int
(
*
)(
cl_event
);
using
clGetKernelWorkGroupInfoFunc
=
cl_int
(
*
)(
cl_kernel
,
cl_device_id
,
cl_kernel_work_group_info
,
size_t
,
void
*
,
size_t
*
);
using
clGetKernelWorkGroupInfoFunc
=
cl_int
(
*
)(
cl_kernel
,
cl_device_id
,
cl_kernel_work_group_info
,
size_t
,
void
*
,
size_t
*
);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
...
...
@@ -149,6 +159,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR
(
clReleaseKernel
);
DEFINE_FUNC_PTR
(
clCreateProgramWithSource
);
DEFINE_FUNC_PTR
(
clCreateBuffer
);
DEFINE_FUNC_PTR
(
clCreateImage
);
DEFINE_FUNC_PTR
(
clRetainKernel
);
DEFINE_FUNC_PTR
(
clCreateKernel
);
DEFINE_FUNC_PTR
(
clGetProgramInfo
);
...
...
@@ -269,6 +280,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM
(
clReleaseKernel
);
ASSIGN_FROM_DLSYM
(
clCreateProgramWithSource
);
ASSIGN_FROM_DLSYM
(
clCreateBuffer
);
ASSIGN_FROM_DLSYM
(
clCreateImage
);
ASSIGN_FROM_DLSYM
(
clRetainKernel
);
ASSIGN_FROM_DLSYM
(
clCreateKernel
);
ASSIGN_FROM_DLSYM
(
clGetProgramInfo
);
...
...
@@ -708,6 +720,24 @@ cl_mem clCreateBuffer(cl_context context,
}
}
cl_mem
clCreateImage
(
cl_context
context
,
cl_mem_flags
flags
,
const
cl_image_format
*
image_format
,
const
cl_image_desc
*
image_desc
,
void
*
host_ptr
,
cl_int
*
errcode_ret
)
{
auto
func
=
mace
::
OpenCLLibraryImpl
::
Get
().
clCreateImage
;
if
(
func
!=
nullptr
)
{
return
func
(
context
,
flags
,
image_format
,
image_desc
,
host_ptr
,
errcode_ret
);
}
else
{
if
(
errcode_ret
!=
nullptr
)
{
*
errcode_ret
=
CL_OUT_OF_RESOURCES
;
}
return
nullptr
;
}
}
cl_program
clCreateProgramWithSource
(
cl_context
context
,
cl_uint
count
,
const
char
**
strings
,
...
...
@@ -795,8 +825,8 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
size_t
*
param_value_size_ret
)
{
auto
func
=
mace
::
OpenCLLibraryImpl
::
Get
().
clGetKernelWorkGroupInfo
;
if
(
func
!=
nullptr
)
{
return
func
(
kernel
,
device
,
param_name
,
param_value_size
,
param_value
,
param_value
_size_ret
);
return
func
(
kernel
,
device
,
param_name
,
param_value_size
,
param_value
,
param_value_size_ret
);
}
else
{
return
CL_OUT_OF_RESOURCES
;
}
...
...
mace/kernels/opencl/cl/common.h
0 → 100644
浏览文件 @
e4455322
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
e4455322
void
kernel
conv_2d_1x1_naive
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
global
const
float
*bias,
/*
o
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
int
input_channels
)
{
#
include
<common.h>
__kernel
void
conv_2d_1x1_naive
(
__global
const
float
*input,
/*
n,
c,
h,
w
*/
__global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
__global
const
float
*bias,
/*
o
*/
__global
float
*output,
/*
n,
c,
h,
w
*/
__private
const
int
in_chan_num
)
{
const
int
batch
=
get_global_id
(
0
)
;
const
int
channel
=
get_global_id
(
1
)
;
const
int
channels
=
get_global_size
(
1
)
;
const
int
pixel
=
get_global_id
(
2
)
;
const
int
pixels
=
get_global_size
(
2
)
;
float
*output_ptr
=
output
+
(
batch
*
channels
+
channel
)
*
pixels
;
output_ptr[pixel]
=
bias[channel]
;
for
(
int
inc
=
0
; inc < in
put_channels
; ++inc) {
const
float
*input_ptr
=
input
+
(
batch
*
in
put_channels
+
inc
)
*
pixels
+
pixel
;
const
float
weights
=
filter[channel
*
in
put_channels
+
inc]
;
for
(
int
inc
=
0
; inc < in
_chan_num
; ++inc) {
const
float
*input_ptr
=
input
+
(
batch
*
in
_chan_num
+
inc
)
*
pixels
+
pixel
;
const
float
weights
=
filter[channel
*
in
_chan_num
+
inc]
;
float
in
=
input_ptr[0]
;
float
out
=
output_ptr[0]
;
out
+=
in
*
weights
;
...
...
@@ -23,17 +24,19 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
}
}
void
kernel
conv_2d_1x1_v2
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
global
const
float
*bias,
/*
o
*/
global
float
*output,
/*
n,
c,
h,
w
*/
private
const
int
in_chan_num,
private
const
int
out_chan_num,
private
const
int
pixel_num
)
{
__kernel
void
conv_2d_1x1_v2
(
__
global
const
float
*input,
/*
n,
c,
h,
w
*/
__
global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
__
global
const
float
*bias,
/*
o
*/
__
global
float
*output,
/*
n,
c,
h,
w
*/
__
private
const
int
in_chan_num,
__
private
const
int
out_chan_num,
__
private
const
int
pixel_num
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
__private
float
output_slice[4
*
4]
;
const
int
out_chan_begin
=
out_chan_blk
*
4
;
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_blk
*
4
;
...
...
@@ -41,21 +44,23 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
const
int
in_offset
=
batch
*
in_chan_num
*
pixel_num
;
const
int
out_offset
=
batch
*
out_chan_num
*
pixel_num
;
const
float
*input_base
=
input
+
in_offset
+
out_pixel_begin
;
float
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
int
out_chan_len
=
out_chan_end
-
out_chan_begin
;
int
pixel_len
=
out_pixel_end
-
out_pixel_begin
;
for
(
int
out_chan
=
out_chan_begin
; out_chan < out_chan_end; ++out_chan) {
float
bias_value
=
bias[out_chan]
;
float
*output_ptr
=
output_base
+
out_chan
*
pixel_num
;
for
(
int
p
=
0
; p < pixel
s
; ++p) {
output_
ptr[
p]
=
bias_value
;
int
out_chan_offset
=
out_chan
-
out_chan_begin
;
for
(
int
p
=
0
; p < pixel
_len
; ++p) {
output_
slice[out_chan_offset
*
4
+
p]
=
bias_value
;
}
}
int
in_chan
=
0
;
if
(
pixel
s
==
4
)
{
if
(
pixel
_len
==
4
)
{
for
(
; in_chan + 3 < in_chan_num; in_chan += 4) {
const
float
*input_ptr
=
input_base
+
in_chan
*
pixel_num
;
int
out_chan
=
out_chan_begin
;
...
...
@@ -66,30 +71,32 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
float4
in1
=
vload4
(
0
,
input_ptr
+
pixel_num
)
;
float4
in2
=
vload4
(
0
,
input_ptr
+
2
*
pixel_num
)
;
float4
in3
=
vload4
(
0
,
input_ptr
+
3
*
pixel_num
)
;
#
pragma
unroll
for
(
int
oc
=
0
; oc < 4; ++oc) {
float4
weights
=
vload4
(
0
,
filter_ptr
+
oc
*
in_chan_num
)
;
float4
out
=
vload4
(
0
,
output_ptr
+
oc
*
pixel_num
)
;
float4
out
=
vload4
(
oc,
output_slice
)
;
out
+=
in0
*
weights.x
;
out
+=
in1
*
weights.y
;
out
+=
in2
*
weights.z
;
out
+=
in3
*
weights.w
;
vstore4
(
out,
0
,
output_ptr
+
oc
*
pixel_num
)
;
vstore4
(
out,
oc,
output_slice
)
;
}
}
for
(
; out_chan < out_chan_end; ++out_chan) {
const
float*
filter_ptr
=
filter
+
out_chan
*
in_chan_num
+
in_chan
;
float
*output_ptr
=
output_base
+
out_chan
*
pixel_num
;
int
out_chan_offset
=
out_chan
-
out_chan_begin
;
float4
weights
=
vload4
(
0
,
filter_ptr
)
;
float4
in0
=
vload4
(
0
,
input_ptr
)
;
float4
in1
=
vload4
(
0
,
input_ptr
+
pixel_num
)
;
float4
in2
=
vload4
(
0
,
input_ptr
+
2
*
pixel_num
)
;
float4
in3
=
vload4
(
0
,
input_ptr
+
3
*
pixel_num
)
;
float4
out
=
vload4
(
0
,
output_ptr
)
;
float4
out
=
vload4
(
out_chan_offset,
output_slice
)
;
out
+=
in0
*
weights.x
;
out
+=
in1
*
weights.y
;
out
+=
in2
*
weights.z
;
out
+=
in3
*
weights.w
;
vstore4
(
out,
0
,
output_ptr
)
;
vstore4
(
out,
out_chan_offset,
output_slice
)
;
}
}
}
...
...
@@ -99,13 +106,71 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
for
(
int
out_chan
=
out_chan_begin
; out_chan < out_chan_end; ++out_chan) {
float
weights
=
filter[out_chan
*
in_chan_num
+
in_chan]
;
float
*output_ptr
=
output_base
+
out_chan
*
pixel_num
;
int
out_chan_offset
=
out_chan
-
out_chan_begin
;
for
(
int
p
=
0
; p < pixel
s
; ++p) {
for
(
int
p
=
0
; p < pixel
_len
; ++p) {
float
in
=
input_ptr[p]
;
float
out
=
output_ptr[p]
;
out
+=
in
*
weights
;
output_ptr[p]
=
out
;
output_slice[out_chan_offset
*
4
+
p]
+=
in
*
weights
;
}
}
}
for
(
int
out_chan_offset
=
0
; out_chan_offset < out_chan_len; ++out_chan_offset) {
int
out_chan
=
out_chan_begin
+
out_chan_offset
;
float
*output_ptr
=
output_base
+
out_chan
*
pixel_num
;
if
(
pixel_len
==
4
)
{
float4
out
=
vload4
(
out_chan_offset,
output_slice
)
;
vstore4
(
out,
0
,
output_ptr
)
;
}
else
{
int
offset
=
out_chan_offset
<<
2
;
for
(
int
p
=
0
; p < pixel_len; ++p) {
output_ptr[p]
=
output_slice[offset
+
p]
;
}
}
}
}
/*
FIXME
this
is
incomplete
*/
__kernel
void
conv_2d_1x1_v3
(
__read_only
image3d_t
input,
/*
n,
c/4,
h,
w,
4
*/
__global
const
float
*filter,
/*
o,
i,
kh,
kw
*/
__global
const
float
*bias,
/*
o
*/
__write_only
image3d_t
output,
/*
n,
c/4,
h,
w,
4
*/
__private
const
int
batch_num,
__private
const
int
in_chan_num,
__private
const
int
out_chan_num,
__private
const
int
height,
__private
const
int
width
)
{
int
out_chan_blk
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
int
w
=
get_global_id
(
2
)
;
int
in_chan_blk_num
=
(
in_chan_num
+
3
)
/
4
;
int
out_chan_blk_num
=
(
out_chan_num
+
3
)
/
4
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
for
(
int
batch
=
0
; batch < batch_num; ++batch) {
float4
bias_value
=
vload4
(
out_chan_blk,
bias
)
;
__private
float4
out
=
bias_value
;
for
(
int
in_chan_blk
=
0
; in_chan_blk < in_chan_blk_num; ++in_chan_blk) {
int
in_d
=
batch
*
in_chan_blk_num
+
in_chan_blk
;
float4
in
=
read_imagef
(
input,
sampler,
(
int4
)(
in_d,
h,
w,
0
))
;
const
float
*filter_base
=
filter
+
(
out_chan_blk
<<
2
)
*
in_chan_num
;
float4
weights
=
vload4
(
in_chan_blk,
filter_base
)
;
out.x
+=
dot
(
in,
weights
)
;
weights
=
vload4
(
in_chan_blk,
filter_base
+
in_chan_num
)
;
out.y
+=
dot
(
in,
weights
)
;
weights
=
vload4
(
in_chan_blk,
filter_base
+
in_chan_num
*
2
)
;
out.z
+=
dot
(
in,
weights
)
;
weights
=
vload4
(
in_chan_blk,
filter_base
+
in_chan_num
*
3
)
;
out.w
+=
dot
(
in,
weights
)
;
}
int
out_d
=
batch
*
out_chan_blk_num
+
out_chan_blk
;
int4
out_coord
=
(
int4
)(
out_d,
h,
w,
0
)
;
write_imagef
(
output,
out_coord,
out
)
;
}
}
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
e4455322
...
...
@@ -2,9 +2,10 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/conv_2d.h"
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/utils/utils.h"
namespace
mace
{
...
...
@@ -22,22 +23,22 @@ void Conv1x1Naive(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
(
program
,
"conv_2d_1x1_naive"
);
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
(
program
,
"conv_2d_1x1_naive"
);
const
index_t
pixels
=
height
*
width
;
cl_int
error
;
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channels
),
static_cast
<
int
>
(
pixels
)),
cl
::
NDRange
(
1
,
1
,
128
)),
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(
),
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channels
),
static_cast
<
int
>
(
pixels
)),
cl
::
NDRange
(
1
,
1
,
128
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
input
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
filter
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
bias
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())),
static_cast
<
int
>
(
input_channels
),
error
);
static_cast
<
int
>
(
input_channels
),
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
...
...
@@ -53,31 +54,126 @@ void Conv1x1V2(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
auto
conv_2d
=
cl
::
KernelFunctor
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
int
,
int
>
(
program
,
"conv_2d_1x1_v2"
);
const
index_t
pixels
=
height
*
width
;
const
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
const
index_t
pixel_blocks
=
(
pixels
+
3
)
/
4
;
// TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
// TODO check wired clReleaseCommandQueue latency
// The KernelFunctor can cause segment faults in cb_retain_event
auto
conv_2d_kernel
=
cl
::
Kernel
(
program
,
"conv_2d_1x1_v2"
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
uint32_t
idx
=
0
;
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input_channels
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
pixels
));
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
=
command_queue
.
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channel_blocks
),
static_cast
<
int
>
(
pixel_blocks
)),
cl
::
NDRange
(
1
,
2
,
kwg_size
/
2
));
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
}
void
Conv1x1V3
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
shape
()[
0
];
const
index_t
channels
=
output
->
shape
()[
1
];
const
index_t
height
=
output
->
shape
()[
2
];
const
index_t
width
=
output
->
shape
()[
3
];
const
index_t
input_channels
=
input
->
shape
()[
1
];
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
const
index_t
pixels
=
height
*
width
;
const
index_t
pixel_blocks
=
(
pixels
+
3
)
/
4
;
const
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
const
index_t
input_channel_blocks
=
(
input_channels
+
3
)
/
4
;
// FIXME temp hacking
static
std
::
map
<
std
::
ptrdiff_t
,
cl
::
Image3D
>
input_image_map
;
static
std
::
map
<
std
::
ptrdiff_t
,
cl
::
Image3D
>
output_image_map
;
cl
::
Image3D
input_image
;
cl
::
Image3D
output_image
;
auto
input_iter
=
input_image_map
.
find
(
reinterpret_cast
<
std
::
ptrdiff_t
>
(
input
->
buffer
()));
if
(
input_iter
!=
input_image_map
.
end
())
{
input_image
=
input_iter
->
second
;
}
else
{
// The batch dimension is collapsed with channel
cl_int
error
;
cl
::
Image3D
image
=
cl
::
Image3D
(
OpenCLRuntime
::
Get
()
->
context
(),
CL_MEM_READ_ONLY
|
CL_MEM_ALLOC_HOST_PTR
,
cl
::
ImageFormat
(
CL_RGBA
,
CL_FLOAT
),
height
,
width
,
batch
*
input_channel_blocks
,
0
,
0
,
nullptr
,
&
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
input_image
=
image
;
input_image_map
.
clear
();
input_image_map
.
emplace
(
reinterpret_cast
<
std
::
ptrdiff_t
>
(
input
->
buffer
()),
image
);
}
auto
output_iter
=
output_image_map
.
find
(
reinterpret_cast
<
std
::
ptrdiff_t
>
(
output
->
buffer
()));
if
(
output_iter
!=
output_image_map
.
end
())
{
output_image
=
output_iter
->
second
;
}
else
{
cl_int
error
;
cl
::
Image3D
image
=
cl
::
Image3D
(
OpenCLRuntime
::
Get
()
->
context
(),
CL_MEM_WRITE_ONLY
|
CL_MEM_ALLOC_HOST_PTR
,
cl
::
ImageFormat
(
CL_RGBA
,
CL_FLOAT
),
height
,
width
,
batch
*
channel_blocks
,
0
,
0
,
nullptr
,
&
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
output_image
=
image
;
output_image_map
.
clear
();
output_image_map
.
emplace
(
reinterpret_cast
<
std
::
ptrdiff_t
>
(
output
->
buffer
()),
image
);
}
auto
conv_2d_kernel
=
cl
::
Kernel
(
program
,
"conv_2d_1x1_v3"
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
uint32_t
idx
=
0
;
conv_2d_kernel
.
setArg
(
idx
++
,
input_image
);
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
output_image
);
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
batch
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input_channels
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
;
conv_2d
(
cl
::
EnqueueArgs
(
runtime
->
command_queue
(),
cl
::
NDRange
(
static_cast
<
int
>
(
batch
),
static_cast
<
int
>
(
channel_blocks
),
static_cast
<
int
>
(
pixel_blocks
)),
cl
::
NDRange
(
1
,
1
,
256
)),
*
(
static_cast
<
cl
::
Buffer
*>
(
input
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
filter
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
bias
->
buffer
())),
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())),
static_cast
<
int
>
(
input_channels
),
static_cast
<
int
>
(
channels
),
static_cast
<
int
>
(
pixels
),
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
error
=
command_queue
.
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
int
>
(
channel_blocks
),
static_cast
<
int
>
(
height
),
static_cast
<
int
>
(
width
)),
cl
::
NDRange
(
1
,
2
,
kwg_size
/
2
));
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
}
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
shape
()[
0
];
const
index_t
height
=
output
->
shape
()[
2
];
const
index_t
width
=
output
->
shape
()[
3
];
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录