Commit 98d3c7b9
Authored on Apr 22, 2018 by liuqi

New opencl kernel time limit strategy to support opencl 1.1/1.2.
Parent: 83623605
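
As the diff below suggests, the trailing element of each kernel's default lws vector changes from 1 to 0, and the tuning helpers now interpret that trailing tuning parameter as a block size rather than a block count, with 0 meaning "launch the whole range in one block". Because OpenCL 1.1/1.2 lack the non-uniform work-groups introduced in OpenCL 2.0, the helpers also round each global dimension up to a multiple of the chosen local size before enqueueing. Below is a minimal sketch of the two integer helpers the new call sites lean on; these are illustrative re-implementations, not the definitions from MACE's own utility headers.

#include <cstdint>
#include <iostream>

// Round `value` up to the next multiple of `factor` (illustrative
// re-implementation of the RoundUp helper used in the diff).
uint32_t RoundUp(uint32_t value, uint32_t factor) {
  return factor == 0 ? value : (value + factor - 1) / factor * factor;
}

// Ceiling division (illustrative re-implementation of RoundUpDiv).
uint32_t RoundUpDiv(uint32_t value, uint32_t divisor) {
  return (value + divisor - 1) / divisor;
}

int main() {
  // Example: a global work size of 67 with a local size of 8 must be
  // padded to 72 on OpenCL 1.1/1.2, which require gws % lws == 0.
  std::cout << RoundUp(67, 8) << "\n";      // 72
  // Splitting 72 items into blocks of at most 32 yields 3 launches.
  std::cout << RoundUpDiv(72, 32) << "\n";  // 3
  return 0;
}
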
Showing 22 changed files with 111 additions and 135 deletions:
mace/kernels/opencl/activation_opencl.cc        +1   -1
mace/kernels/opencl/addn.cc                     +1   -1
mace/kernels/opencl/batch_norm_opencl.cc        +1   -1
mace/kernels/opencl/channel_shuffle.cc          +1   -1
mace/kernels/opencl/concat.cc                   +1   -1
mace/kernels/opencl/conv_2d_opencl_1x1.cc       +1   -1
mace/kernels/opencl/conv_2d_opencl_3x3.cc       +1   -1
mace/kernels/opencl/conv_2d_opencl_general.cc   +1   -1
mace/kernels/opencl/cwise_opencl.cc             +1   -1
mace/kernels/opencl/depth_to_space_opencl.cc    +1   -1
mace/kernels/opencl/depthwise_conv_opencl.cc    +1   -1
mace/kernels/opencl/eltwise_opencl.cc           +1   -1
mace/kernels/opencl/fully_connected_opencl.cc   +1   -1
mace/kernels/opencl/helper.cc                   +88  -113
mace/kernels/opencl/matmul.cc                   +1   -1
mace/kernels/opencl/pad.cc                      +1   -1
mace/kernels/opencl/pooling_opencl.cc           +1   -1
mace/kernels/opencl/resize_bilinear_opencl.cc   +1   -1
mace/kernels/opencl/softmax_opencl.cc           +1   -1
mace/kernels/opencl/space_to_batch_opencl.cc    +1   -1
mace/kernels/opencl/winograd_transform.cc       +3   -3
mace/python/tools/caffe_converter_lib.py        +1   -0
mace/kernels/opencl/activation_opencl.cc
@@ -110,7 +110,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));

mace/kernels/opencl/addn.cc
@@ -105,7 +105,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensors[0]->shape();
   }
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
      << "_" << output_shape[2] << "_" << output_shape[3];

mace/kernels/opencl/batch_norm_opencl.cc
@@ -116,7 +116,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);

mace/kernels/opencl/channel_shuffle.cc
@@ -90,7 +90,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "channel_shuffle_opencl_kernel_" << output->dim(0) << "_"

mace/kernels/opencl/concat.cc
@@ -95,7 +95,7 @@ static void Concat2(cl::Kernel *kernel,
     *prev_input_shape = input0->shape();
   }
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::stringstream ss;
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);

mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -130,7 +130,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));

mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -128,7 +128,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
-  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
+  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 0};
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));

mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -130,7 +130,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::string tuning_key =
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));

mace/kernels/opencl/cwise_opencl.cc
@@ -76,7 +76,7 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::stringstream ss;
   ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);

mace/kernels/opencl/depth_to_space_opencl.cc
@@ -134,7 +134,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
   if (runtime->IsOutOfRangeCheckEnabled()) {

mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -149,7 +149,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
                                   batch, height, width, channels, multiplier);
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);

mace/kernels/opencl/eltwise_opencl.cc
@@ -85,7 +85,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     input_shape_ = input0->shape();
   }
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::stringstream ss;
   ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);

mace/kernels/opencl/fully_connected_opencl.cc
@@ -233,7 +233,7 @@ void FCWTXKernel(cl::Kernel *kernel,
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    *lws = {16, kwg_size / 16, 1};
+    *lws = {16, kwg_size / 16, 0};
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     const index_t batch = output->dim(0);

mace/kernels/opencl/helper.cc
@@ -223,23 +223,23 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
     return {
         // TODO(heliangliang): tuning these magic numbers
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {4, kwg_size / 16, 4, 1},
-        {4, kwg_size / 28, 7, 1},
-        {4, kwg_size / 32, 8, 1},
-        {4, kwg_size / 56, 14, 1},
-        {1, kwg_size, 1, 1},
+        {local_ws[0], local_ws[1], local_ws[2], 0},
+        {kwg_size / 16, 4, 4, 0},
+        {kwg_size / 32, 4, 8, 0},
+        {kwg_size / 32, 8, 4, 0},
+        {kwg_size / 64, 8, 8, 0},
+        {kwg_size / 64, 16, 4, 0},
+        {kwg_size / 128, 8, 16, 0},
+        {kwg_size / 128, 16, 8, 0},
+        {kwg_size / 128, 32, 4, 0},
+        {1, kwg_size / 32, 32, 0},
+        {1, kwg_size / 64, 64, 0},
+        {1, kwg_size / 128, 128, 0},
+        {4, kwg_size / 16, 4, 0},
+        {4, kwg_size / 28, 7, 0},
+        {4, kwg_size / 32, 8, 0},
+        {4, kwg_size / 56, 14, 0},
+        {1, kwg_size, 1, 0},
     };
   };
   cl::Event event;

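Each fixed candidate in the table above appears to be chosen so that the product of its first three components, the actual local work-group size, stays within the kernel's maximum work-group size, while the trailing element, now 0, is the block-size hint consumed later in this file. Below is a small self-contained check of that property, using an assumed kwg_size of 256 purely for illustration.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Assumed value for illustration only; the real kwg_size comes from
  // runtime->GetKernelMaxWorkGroupSize(kernel) at run time.
  const uint32_t kwg_size = 256;

  // The fixed candidates from the new table (local_ws-based entry omitted).
  const std::vector<std::vector<uint32_t>> candidates = {
      {kwg_size / 16, 4, 4, 0},    {kwg_size / 32, 4, 8, 0},
      {kwg_size / 32, 8, 4, 0},    {kwg_size / 64, 8, 8, 0},
      {kwg_size / 64, 16, 4, 0},   {kwg_size / 128, 8, 16, 0},
      {kwg_size / 128, 16, 8, 0},  {kwg_size / 128, 32, 4, 0},
      {1, kwg_size / 32, 32, 0},   {1, kwg_size / 64, 64, 0},
      {1, kwg_size / 128, 128, 0}, {4, kwg_size / 16, 4, 0},
      {4, kwg_size / 28, 7, 0},    {4, kwg_size / 32, 8, 0},
      {4, kwg_size / 56, 14, 0},   {1, kwg_size, 1, 0}};

  for (const auto &c : candidates) {
    const uint32_t threads = c[0] * c[1] * c[2];
    std::cout << "{" << c[0] << ", " << c[1] << ", " << c[2] << "} -> "
              << threads << (threads <= kwg_size ? " (ok)" : " (too large)")
              << "\n";
  }
  return 0;
}
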
@@ -248,46 +248,35 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
     MACE_CHECK(params.size() == 4)
         << "Tuning parameters of 3D kernel must be 4D";
     cl_int error = CL_SUCCESS;
-    std::vector<uint32_t> roundup_gws(3);
+    std::vector<uint32_t> internal_gws(gws, gws + 3);
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       for (size_t i = 0; i < 3; ++i) {
-        roundup_gws[i] = RoundUp(gws[i], params[i]);
+        internal_gws[i] = RoundUp(gws[i], params[i]);
       }
     }
     if (timer == nullptr) {
-      uint32_t num_blocks = params[3];
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
+      uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
+      const uint32_t num_blocks =
+          RoundUpDiv<uint32_t>(internal_gws[2], block_size);
      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        if (runtime->IsNonUniformWorkgroupsSupported()) {
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, 0, i * block_size),
-              cl::NDRange(gws[0], gws[1], gws2),
-              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        } else {
-          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, 0, i * block_size),
-              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        }
+        uint32_t gws2 = block_size;
+        if (runtime->IsNonUniformWorkgroupsSupported() &&
+            (i == num_blocks - 1)) {
+          gws2 = (internal_gws[2] - (i * block_size));
+        }
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(internal_gws[0], internal_gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
         MACE_CHECK_CL_SUCCESS(error);
       }
     } else {
       timer->ClearTiming();
-      if (runtime->IsNonUniformWorkgroupsSupported()) {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      } else {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange,
-            cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      }
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange,
+          cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
       MACE_CHECK_CL_SUCCESS(error);
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());

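In the non-timed path above, the third global dimension is walked in blocks of block_size, each launched with a global offset of i * block_size; only the last block shrinks, and only when non-uniform work-groups are available, since otherwise every launch must keep the global size a multiple of the local size. Below is a standalone sketch of that decomposition, with invented gws and params values.

#include <cstdint>
#include <iostream>

uint32_t RoundUp(uint32_t value, uint32_t factor) {
  return (value + factor - 1) / factor * factor;
}
uint32_t RoundUpDiv(uint32_t value, uint32_t divisor) {
  return (value + divisor - 1) / divisor;
}

int main() {
  // Invented example values: gws is the global work size, params[0..2] the
  // tuned local size, params[3] the block-size hint stored by the tuner.
  uint32_t gws[3] = {8, 64, 100};
  uint32_t params[4] = {8, 16, 8, 24};
  const bool non_uniform_supported = false;  // the OpenCL 1.1/1.2 case

  uint32_t internal_gws[3] = {gws[0], gws[1], gws[2]};
  if (!non_uniform_supported) {
    // Pad every dimension up to a multiple of the local size.
    for (int i = 0; i < 3; ++i) internal_gws[i] = RoundUp(gws[i], params[i]);
  }

  const uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
  const uint32_t num_blocks = RoundUpDiv(internal_gws[2], block_size);

  for (uint32_t i = 0; i < num_blocks; ++i) {
    uint32_t gws2 = block_size;
    if (non_uniform_supported && i == num_blocks - 1) {
      gws2 = internal_gws[2] - i * block_size;  // shrink only the last block
    }
    std::cout << "launch " << i << ": offset z=" << i * block_size
              << ", size {" << internal_gws[0] << ", " << internal_gws[1]
              << ", " << gws2 << "}\n";
  }
  return 0;
}
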
@@ -297,24 +286,22 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         timer->ClearTiming();
         uint32_t num_blocks = std::min(
             static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-        (*tuning_result)[3] = num_blocks;
-        const uint32_t block_size = gws[2] / num_blocks;
-        if (gws[2] % num_blocks > 0) num_blocks++;
+        uint32_t block_size = gws[2] / num_blocks;
+        if (!runtime->IsNonUniformWorkgroupsSupported()) {
+          block_size = RoundUp(block_size, params[2]);
+        }
+        (*tuning_result)[3] = block_size;
+        num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
         for (uint32_t i = 0; i < num_blocks; ++i) {
-          uint32_t gws2 =
-              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          if (runtime->IsNonUniformWorkgroupsSupported()) {
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, 0, i * block_size),
-                cl::NDRange(gws[0], gws[1], gws2),
-                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          } else {
-            uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, 0, i * block_size),
-                cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          }
+          uint32_t gws2 = block_size;
+          if (runtime->IsNonUniformWorkgroupsSupported() &&
+              (i == num_blocks - 1)) {
+            gws2 = (internal_gws[2] - (i * block_size));
+          }
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(internal_gws[0], internal_gws[1], gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
           MACE_CHECK_CL_SUCCESS(error);
           timer->AccumulateTiming();
         }

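When the timed run exceeds kMaxKernelExeTime, the hunk above estimates how many launches are needed from the measured time, converts that into a block size, rounds the block size up to a multiple of the local size on devices without non-uniform work-groups, and stores the block size (not the block count) as the fourth tuning parameter. Below is a worked example of that arithmetic, with all numbers invented.

#include <cstdint>
#include <iostream>

uint32_t RoundUp(uint32_t value, uint32_t factor) {
  return (value + factor - 1) / factor * factor;
}
uint32_t RoundUpDiv(uint32_t value, uint32_t divisor) {
  return (value + divisor - 1) / divisor;
}

int main() {
  // Invented numbers: a 25 ms measured run against a 10 ms per-launch cap.
  const double elapse_time = 25.0;
  const double kMaxKernelExeTime = 10.0;
  const uint32_t gws2_total = 100;     // gws[2]
  const uint32_t internal_gws2 = 104;  // gws[2] rounded up to local size 8
  const uint32_t local2 = 8;           // params[2]
  const bool non_uniform_supported = false;

  uint32_t num_blocks =
      static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1;  // 3
  if (num_blocks > gws2_total) num_blocks = gws2_total;  // std::min in the diff
  uint32_t block_size = gws2_total / num_blocks;         // 100 / 3 = 33
  if (!non_uniform_supported) {
    block_size = RoundUp(block_size, local2);            // 33 -> 40
  }
  num_blocks = RoundUpDiv(internal_gws2, block_size);    // ceil(104 / 40) = 3

  std::cout << "block_size stored in tuning_result: " << block_size
            << ", launches: " << num_blocks << "\n";
  return 0;
}
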
@@ -349,16 +336,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     uint32_t local_ws[2];
     local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
     local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1], 1},
-            {local_ws[1], local_ws[0], 1},
-            {kwg_size / 4, 4, 1},
-            {kwg_size / 16, 16, 1},
-            {kwg_size / 32, 32, 1},
-            {kwg_size / 64, 64, 1},
-            {kwg_size / 128, 128, 1},
-            {kwg_size / 256, 256, 1},
-            {kwg_size, 1, 1},
-            {1, kwg_size, 1}};
+    return {{local_ws[0], local_ws[1], 0},
+            {local_ws[1], local_ws[0], 0},
+            {kwg_size / 4, 4, 0},
+            {kwg_size / 16, 16, 0},
+            {kwg_size / 32, 32, 0},
+            {kwg_size / 64, 64, 0},
+            {kwg_size / 128, 128, 0},
+            {kwg_size / 256, 256, 0},
+            {kwg_size, 1, 0},
+            {1, kwg_size, 0}};
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params, Timer *timer,

@@ -366,44 +353,34 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     MACE_CHECK(params.size() == 3)
         << "Tuning parameters of 2D kernel must be 3d";
     cl_int error = CL_SUCCESS;
-    std::vector<uint32_t> roundup_gws(2);
+    std::vector<uint32_t> internal_gws(gws, gws + 2);
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       for (size_t i = 0; i < 2; ++i) {
-        roundup_gws[i] = RoundUp(gws[i], params[i]);
+        internal_gws[i] = RoundUp(gws[i], params[i]);
       }
     }
     if (timer == nullptr) {
-      uint32_t num_blocks = params[2];
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
+      uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
+      const uint32_t num_blocks =
+          RoundUpDiv<uint32_t>(internal_gws[1], block_size);
      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        if (runtime->IsNonUniformWorkgroupsSupported()) {
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size),
-              cl::NDRange(gws[0], gws1),
-              cl::NDRange(params[0], params[1]), nullptr, &event);
-        } else {
-          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size),
-              cl::NDRange(roundup_gws[0], roundup_gws1),
-              cl::NDRange(params[0], params[1]), nullptr, &event);
-        }
+        uint32_t gws1 = block_size;
+        if (runtime->IsNonUniformWorkgroupsSupported() &&
+            (i == num_blocks - 1)) {
+          gws1 = (internal_gws[1] - (i * block_size));
+        }
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, i * block_size),
+            cl::NDRange(internal_gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
         MACE_CHECK_CL_SUCCESS(error);
       }
     } else {
       timer->ClearTiming();
-      if (runtime->IsNonUniformWorkgroupsSupported()) {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-      } else {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange,
-            cl::NDRange(roundup_gws[0], roundup_gws[1]),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-      }
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange,
+          cl::NDRange(internal_gws[0], internal_gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
       MACE_CHECK_CL_SUCCESS(error);
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());

@@ -413,24 +390,22 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         timer->ClearTiming();
         uint32_t num_blocks = std::min(
             static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
-        (*tuning_result)[2] = num_blocks;
-        const uint32_t block_size = gws[1] / num_blocks;
-        if (gws[1] % num_blocks > 0) num_blocks++;
+        uint32_t block_size = gws[1] / num_blocks;
+        if (!runtime->IsNonUniformWorkgroupsSupported()) {
+          block_size = RoundUp(block_size, params[1]);
+        }
+        (*tuning_result)[2] = block_size;
+        num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
         for (uint32_t i = 0; i < num_blocks; ++i) {
-          uint32_t gws1 =
-              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          if (runtime->IsNonUniformWorkgroupsSupported()) {
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, i * block_size),
-                cl::NDRange(gws[0], gws1),
-                cl::NDRange(params[0], params[1]), nullptr, &event);
-          } else {
-            uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, i * block_size),
-                cl::NDRange(roundup_gws[0], roundup_gws1),
-                cl::NDRange(params[0], params[1]), nullptr, &event);
-          }
+          uint32_t gws1 = block_size;
+          if (runtime->IsNonUniformWorkgroupsSupported() &&
+              (i == num_blocks - 1)) {
+            gws1 = (internal_gws[1] - (i * block_size));
+          }
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size),
+              cl::NDRange(internal_gws[0], gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
           MACE_CHECK_CL_SUCCESS(error);
           timer->AccumulateTiming();
         }

mace/kernels/opencl/matmul.cc
@@ -84,7 +84,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   kernel_.setArg(idx++, static_cast<int>(height_blocks));
   kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
-  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::stringstream ss;
   ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
      << C->dim(2) << "_" << C->dim(3);

mace/kernels/opencl/pad.cc
@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::string tuning_key =
       Concat("pad", output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));

mace/kernels/opencl/pooling_opencl.cc
@@ -134,7 +134,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     };
   }
-  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);

mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -99,7 +99,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);

mace/kernels/opencl/softmax_opencl.cc
@@ -81,7 +81,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     input_shape_ = logits->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);

mace/kernels/opencl/space_to_batch_opencl.cc
@@ -105,7 +105,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     space_shape_ = space_tensor->shape();
   }
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"

mace/kernels/opencl/winograd_transform.cc
@@ -54,7 +54,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   std::vector<index_t> output_shape(4);
-  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
+  std::vector<index_t> filter_shape = {3, 3, 1, input_tensor->dim(3)};
   std::vector<int> paddings(2);
   if (paddings_.empty()) {
     kernels::CalcNHWCPaddingAndOutputSize(
@@ -101,7 +101,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
   std::stringstream ss;
   ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
      << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
@@ -215,7 +215,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
   std::stringstream ss;
   ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"

mace/python/tools/caffe_converter_lib.py
@@ -559,6 +559,7 @@ class CaffeConverter(object):
         paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
         filter_shape = np.asarray(op.data[0].shape)
+        filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI
         input_format = 'NHWC'
         output_shape = Shapes.conv_pool_shape(