Commit 1794dae4 authored by liuqi

Add more strategy for convolution opencl default lws.

Parent: cf5cae14
@@ -25,7 +25,7 @@ namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
@@ -114,7 +114,7 @@ static void Concat2(cl::Kernel *kernel,
  const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
  std::string tuning_key =
      Concat("concat_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
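The tuning key built here is just a string assembled from the kernel name and the output dimensions, and it is what TuningOrRun3DKernel presumably uses to cache the best local work-group size per shape. As a rough illustration only (the real Concat helper in the codebase may format fields differently), a hypothetical variadic key builder could look like this:

```cpp
#include <initializer_list>
#include <sstream>
#include <string>

// Hypothetical stand-in for the Concat tuning-key helper: it stringifies
// every argument and joins the fields with underscores.
template <typename... Args>
std::string MakeTuningKey(const std::string &prefix, const Args &... args) {
  std::ostringstream stream;
  stream << prefix;
  // Expand the parameter pack in order, e.g.
  // MakeTuningKey("concat_opencl_kernel", 1, 32, 32, 64)
  //   -> "concat_opencl_kernel_1_32_32_64"
  (void)std::initializer_list<int>{((stream << "_" << args), 0)...};
  return stream.str();
}
```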
@@ -23,16 +23,20 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
+// TODO(liuqi): Fix the specific value.
+const uint32_t lws_limit = 128;
 std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size =
       OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
   uint32_t base = cache_size / kBaseGPUMemCacheSize;
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (lws[1] >= base) {
     lws[0] = std::min<uint32_t>(gws[0], base);
+  } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
+    lws[0] = std::min<uint32_t>(gws[0], base);
   } else {
     lws[0] = gws[0] / 8;
     if (lws[0] < base) {
@@ -165,7 +169,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
  std::string tuning_key =
      Concat("conv2d_1x1_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
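Taken together, the added lws_limit constant and the new else-if branch give the 1x1 convolution a third case in its default LWS heuristic. A minimal standalone sketch of that selection logic follows; kBaseGPUMemCacheSize, the unused compute_units, and the tail of the else branch are not visible in the hunk, so the constant value and the fallback shown here are assumptions for illustration only, not the upstream implementation.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

namespace {
// Assumed constant; the real kBaseGPUMemCacheSize is defined elsewhere.
const uint32_t kBaseGPUMemCacheSize = 16384;
// From the diff above; upstream marks the exact value as a TODO.
const uint32_t lws_limit = 128;

std::vector<uint32_t> LocalWSSketch(const uint32_t *gws,
                                    uint32_t kwg_size,
                                    uint64_t cache_size) {
  std::vector<uint32_t> lws(4, 0);
  uint32_t base = static_cast<uint32_t>(cache_size / kBaseGPUMemCacheSize);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
    // New branch from this commit: even when lws[1] is small, a wide first
    // dimension still gets lws[0] capped at `base` instead of gws[0] / 8.
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    // Fallback branch (truncated in the hunk): assumed to shrink lws[0]
    // while keeping it at least 1.
    lws[0] = std::max<uint32_t>(std::min<uint32_t>(gws[0] / 8, base), 1u);
  }
  // The real function also fills lws[2]; omitted in this sketch.
  return lws;
}
}  // namespace

int main() {
  const uint32_t gws[3] = {256, 16, 64};
  std::vector<uint32_t> lws = LocalWSSketch(gws, 256, 1u << 20);
  std::cout << lws[0] << " x " << lws[1] << std::endl;  // prints "64 x 16"
  return 0;
}
```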
@@ -21,7 +21,6 @@
 namespace mace {
 namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
@@ -157,7 +156,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
-  const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
......
@@ -168,7 +168,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
  std::string tuning_key =
      Concat("conv2d_general_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3),
             filter->dim(0), filter->dim(1));
  std::vector<uint32_t> lws =
      LocalWS(gws, filter->dim(0) * filter->dim(1), *kwg_size);
......
@@ -32,7 +32,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= min_lws0) {
    lws[0] = std::min<uint32_t>(gws[0], min_lws0);
  } else {
    lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
    if (lws[0] < min_lws0) {
      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, min_lws0),
......
@@ -215,7 +215,6 @@ std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
  lws[0] = std::min<uint32_t>(base, kwg_size);
  lws[1] = kwg_size / lws[1];
  return lws;
}
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
......
@@ -118,7 +118,6 @@ std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size);
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size);
}  // namespace kernels
}  // namespace mace
#endif  // MACE_KERNELS_OPENCL_HELPER_H_
@@ -86,7 +86,7 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
  std::string tuning_key =
      Concat("matmul_opencl_kernel", C->dim(0),
             C->dim(1), C->dim(2), C->dim(3));
  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
......
@@ -158,7 +158,7 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
  const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
......
@@ -129,7 +129,7 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
  std::string tuning_key =
      Concat("resize_bilinear_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
......
@@ -104,7 +104,7 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
  std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
  std::string tuning_key =
      Concat("softmax_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
......
@@ -103,7 +103,7 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
  std::string tuning_key =
      Concat("winograd_transform_kernel", output_tensor->dim(0),
             output_tensor->dim(1), output_tensor->dim(2),
             output_tensor->dim(3));
  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
@@ -217,7 +217,7 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
  std::string tuning_key =
      Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
             output_tensor->dim(1), output_tensor->dim(2),
             output_tensor->dim(3), input_tensor->dim(2));
  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
......