提交 cf5cae14 编写于 作者: L liuqi

Refactor opencl default local work group size.

上级 708c90ed
...@@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime():
} }
} }
device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
&device_gloabl_mem_cache_size_);
device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS,
&device_compute_units_);
const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK"); const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1 if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
&& out_of_range_check[0] == '1') { && out_of_range_check[0] == '1') {
...@@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; } ...@@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
// Returns the device's global memory cache size in bytes, as queried via
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE when the runtime was constructed.
// NOTE(review): the member name contains a typo ("gloabl"); it is kept here
// because it must match the declaration in the header.
const uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
  return device_gloabl_mem_cache_size_;
}

// Returns the number of parallel compute units on the OpenCL device, as
// queried via CL_DEVICE_MAX_COMPUTE_UNITS when the runtime was constructed.
const uint32_t OpenCLRuntime::device_compute_units() const {
  return device_compute_units_;
}
bool OpenCLRuntime::BuildProgramFromBinary( bool OpenCLRuntime::BuildProgramFromBinary(
const std::string &built_program_key, const std::string &built_program_key,
const std::string &build_options_str, const std::string &build_options_str,
......
...@@ -73,6 +73,8 @@ class OpenCLRuntime { ...@@ -73,6 +73,8 @@ class OpenCLRuntime {
cl::CommandQueue &command_queue(); cl::CommandQueue &command_queue();
const GPUType gpu_type() const; const GPUType gpu_type() const;
const std::string platform_info() const; const std::string platform_info() const;
const uint64_t device_global_mem_cache_size() const;
const uint32_t device_compute_units() const;
cl::Kernel BuildKernel(const std::string &program_name, cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name, const std::string &kernel_name,
...@@ -128,6 +130,9 @@ class OpenCLRuntime { ...@@ -128,6 +130,9 @@ class OpenCLRuntime {
bool program_map_changed_; bool program_map_changed_;
std::unique_ptr<KVStorage> storage_; std::unique_ptr<KVStorage> storage_;
bool is_profiling_enabled_; bool is_profiling_enabled_;
uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_;
static GPUPerfHint kGPUPerfHint; static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint; static GPUPriorityHint kGPUPriorityHint;
......
OpenCL Image Storage Layout
===
Use **Image** objects to optimize memory access and parallel computing, based on OpenCL 2.0.
Design the corresponding **Image** format to optimize memory access for each Op algorithm.
Each pixel of an **Image** object contains 4 elements (e.g. RGBA).
The following are the **Buffer** and **Image** formats for all **Tensors**.
Input/Output
---
**Mace** uses the NHWC format for Input/Output.
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Channel-Major Input/Output | NHWC | [W * (C+3)/4, N * H] | Default Input/Output format|
|Height-Major Input/Output | NHWC | [W * C, N * (H+3)/4] | Winograd Convolution format|
|Width-Major Input/Output | NHWC | [(W+3)/4 * C, N * H] | Winograd Convolution format|
Each pixel of an **Image** contains 4 elements. The table below lists the coordinate mapping
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation
| --------- | :---------:| :-----: |
|Channel-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j/H, h=j%H, w=i%W, c=[i/W * 4 + k])}| k=[0, 4)|
|Height-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j%N, h=[j/H*4 + k], w=i%W, c=i/W)}| k=[0, 4)|
|Width-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j/H, h=j%H, w=[i%W*4 + k], c=i/W)}| k=[0, 4)|
Filter
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Convolution Filter | HWOI | [H * W * RoundUp<4>(I), (O+3)/4]|Convolution filter format; there is no difference compared to [H * W * I, (O+3)/4]|
|Depthwise Convolution Filter | HWIM | [H * W * M, (I+3)/4]|Depthwise-Convolution filter format|
Each pixel of an **Image** contains 4 elements. The table below lists the coordinate mapping
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|Convolution Filter | P[m, n] = {E[h, w, o, i] &#124; (h=T/W, w=T%W, o=[n*4+k], i=m%RI)}| RI=((I + 3) / 4) * 4, T=m/RI, k=[0, 4)|
|Depthwise Convolution Filter | P[m, n] = {E[h, w, i, 0] &#124; (h=m/W, w=m%W, i=[n*4+k])}| only supports multiplier == 1, k=[0, 4)|
1-D Argument
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|1-D Argument | W | [(W+3)/4, 1] | 1D argument format, e.g. Bias|
Each pixel of an **Image** contains 4 elements. The table below lists the coordinate mapping
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|1-D Argument | P[i, 0] = {E[w] &#124; w=i*4+k}| k=[0, 4)|
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
...@@ -56,23 +55,23 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -56,23 +55,23 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
} }
switch (activation_) { switch (activation_) {
case RELU: case RELU:
tuning_key_prefix_ = "relu_opencl_kernel_"; tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel_"; tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case PRELU: case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel_"; tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU"); built_options.emplace("-DUSE_PRELU");
break; break;
case TANH: case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel_"; tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel_"; tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
default: default:
...@@ -110,7 +109,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -110,7 +109,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
......
...@@ -106,10 +106,10 @@ void AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -106,10 +106,10 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::stringstream ss; std::string tuning_key =
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
<< "_" << output_shape[2] << "_" << output_shape[3]; output_tensor->dim(2), output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -116,9 +116,12 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -116,9 +116,12 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size_);
lws[0] = std::min<uint32_t>(4, kwg_size_ / lws[1]);
lws[2] = std::min<uint32_t>(gws[2], kwg_size_ / (lws[1] * lws[0]));
std::string tuning_key = std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
......
...@@ -79,7 +79,7 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -79,7 +79,7 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event; cl::Event event;
cl_int error; cl_int error;
......
...@@ -90,14 +90,11 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -90,14 +90,11 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::stringstream ss; std::string tuning_key =
ss << "channel_shuffle_opencl_kernel_" Concat("channel_shuffle_opencl_kernel", output->dim(0),
<< output->dim(0) << "_" output->dim(1), output->dim(2), output->dim(3));
<< output->dim(1) << "_" TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -21,6 +21,23 @@ ...@@ -21,6 +21,23 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {

// Heuristic default local work-group size for the concat kernels.
// Dimension 1 is filled first (up to the kernel's work-group limit), then
// dimensions 0 and 2 are bounded by `base`, a factor scaled from the GPU's
// global memory cache size.
// NOTE(review): assumes gws[1] >= 1 — a zero global work size would make the
// divisions below undefined; confirm with callers.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to >= 1: a cache size smaller than kBaseGPUMemCacheSize would
  // otherwise yield base == 0 and propagate a zero divisor into
  // `kwg_size / lws_size` below.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(base, kwg_size / lws_size);
  return lws;
}

}  // namespace
static void Concat2(cl::Kernel *kernel, static void Concat2(cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
...@@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel, ...@@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel,
*prev_input_shape = input0->shape(); *prev_input_shape = input0->shape();
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0}; const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::stringstream ss; std::string tuning_key =
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) Concat("concat_opencl_kernel", output->dim(0),
<< "_" << output->dim(2) << "_" << output->dim(3); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
...@@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel,
index_t chan_blk_offset = 0; index_t chan_blk_offset = 0;
cl::Event event; cl::Event event;
CallStats call_stats{INT64_MAX, 0}; CallStats call_stats{INT64_MAX, 0};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
for (int i = 0; i < inputs_count; ++i) { for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i]; const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4; index_t input_channel_blk = input->dim(3) / 4;
...@@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel,
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width), static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
...@@ -183,6 +200,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -183,6 +200,7 @@ static void ConcatN(cl::Kernel *kernel,
for (size_t j = 0; j < 3; ++j) { for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]); roundup_gws[j] = RoundUp(gws[j], lws[j]);
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, *kernel, cl::NullRange,
......
...@@ -20,6 +20,39 @@ ...@@ -20,6 +20,39 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {

// Approximate per-work-item cache footprint of the 1x1 conv kernel:
// (inputs + weights + outputs) * array_size * sizeof(float).
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;

// Heuristic local work-group size for the 1x1 convolution kernel: fill
// dimension 1 first, pick dimension 0 relative to `base` (a factor scaled
// from the GPU cache size), then size dimension 2 so that the work-group's
// estimated cache footprint fits the device cache across compute units.
// NOTE(review): assumes gws[1] >= 1 and a compute-unit count >= 1 (the
// OpenCL spec guarantees CL_DEVICE_MAX_COMPUTE_UNITS >= 1).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
  // Clamp to >= 1 so a small cache size cannot make base == 0, which would
  // allow lws[0] == 0 and a zero divisor in `kwg_size / lws_size` below.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    lws[0] = gws[0] / 8;
    if (lws[0] < base) {
      lws[0] = std::max<uint32_t>(gws[0] / 4, base);
    }
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}

}  // namespace
extern void Conv2dOpenclK1x1(cl::Kernel *kernel, extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -130,9 +163,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -130,9 +163,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0}; std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), Concat("conv2d_1x1_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
...@@ -22,6 +22,35 @@ ...@@ -22,6 +22,35 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {

// Approximate per-work-item cache footprint of the 3x3 conv kernel:
// (inputs + weights + outputs) * array_size * sizeof(float).
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;

// Heuristic local work-group size for the 3x3 convolution kernel.
// NOTE(review): assumes gws[1] >= 1; a zero global work size would make the
// divisions below undefined.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Use half the compute units (but at least one) when estimating how much
  // cache each work-group may occupy.
  uint32_t compute_units = std::max<uint32_t>(
      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
  // Clamp base into [1, 4]: the original lower bound was 0 for devices whose
  // reported cache is smaller than kBaseGPUMemCacheSize, which made lws[0]
  // zero and `kwg_size / lws_size` a division by zero.
  const uint32_t base = std::max<uint32_t>(
      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      RoundUp<uint32_t>(cache_size / kernel_cache_size /
                        lws_size / compute_units, base),
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}

}  // namespace
extern void Conv2dOpenclK3x3(cl::Kernel *kernel, extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -128,9 +157,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -128,9 +157,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 0}; const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), Concat("conv2d_3x3_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
...@@ -21,6 +21,42 @@ ...@@ -21,6 +21,42 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {

// Approximate per-work-item cache footprint of the general conv kernel:
// (inputs + weights + outputs) * array_size * sizeof(float).
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// Threshold below which the whole of gws[2] is used when the cache-based
// estimate collapses to zero.
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20;

// Heuristic local work-group size for the general convolution kernel;
// `kernel_size` is the filter area (filter_h * filter_w), which scales the
// per-work-item cache footprint.
// NOTE(review): assumes gws[0] >= 1 and gws[1] >= 1 — otherwise lws_size is
// zero and the divisions below are undefined; confirm with callers.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kernel_size,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
  // Clamp to >= 1: a zero base would be returned as lws[2] in the fallback
  // branch below, and a zero local work size breaks the RoundUp performed by
  // the kernel launch path.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] = gws[0] / 4;
  if (lws[0] == 0) {
    lws[0] = gws[0];
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / kernel_size / lws_size / compute_units)
      * 8,
      gws[2]);
  if (lws[2] == 0) {
    // Cache-based estimate collapsed; fall back to gws[2] for small launches,
    // otherwise to the cache-derived base factor.
    if (gws[2] < lws_limit) {
      lws[2] = gws[2];
    } else {
      lws[2] = base;
    }
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}

}  // namespace
extern void Conv2dOpencl(cl::Kernel *kernel, extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
...@@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::string tuning_key = std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), Concat("conv2d_general_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3),
filter->dim(0), filter->dim(1));
std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(0) * filter->dim(1), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const char *kernel_name = nullptr; const char *kernel_name = nullptr;
uint32_t gws[3]; uint32_t gws[3];
std::stringstream ss; std::string tuning_key;
index_t output_height, output_width, output_depth; index_t output_height, output_width, output_depth;
if (d2s_) { if (d2s_) {
output_height = input_height * block_size_; output_height = input_height * block_size_;
...@@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth)); gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width); gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch); gws[2] = static_cast<uint32_t>(output_height * batch);
ss << "depth_to_space_opencl_kernel_" << batch << "_" tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height,
<< output_height << "_" << output_width << "_" << output_depth; output_width, output_depth);
} else { } else {
output_height = input_height / block_size_; output_height = input_height / block_size_;
output_width = input_width / block_size_; output_width = input_width / block_size_;
...@@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth)); gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width); gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch); gws[2] = static_cast<uint32_t>(input_height * batch);
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); input->dim(1), input->dim(2), input->dim(3));
} }
const index_t input_depth_blocks = RoundUpDiv4(input_depth); const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth);
...@@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -21,6 +21,37 @@ ...@@ -21,6 +21,37 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {

// Approximate per-work-item cache footprint of the depthwise conv kernel:
// (inputs + weights + outputs) * array_size * sizeof(float).
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;

// Heuristic local work-group size for the depthwise convolution kernel.
// NOTE(review): assumes gws[1] >= 1; a zero global work size would make the
// divisions below undefined.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to >= 1: if min_lws0 were 0 the `lws[0] < min_lws0` rescue branch
  // below could never trigger, leaving lws[0] == 0 and dividing by zero in
  // `kwg_size / lws_size`.
  uint32_t min_lws0 =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= min_lws0) {
    lws[0] = std::min<uint32_t>(gws[0], min_lws0);
  } else {
    lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
    if (lws[0] < min_lws0) {
      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, min_lws0),
                                  kwg_size / lws[1]);
    }
  }
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size) * 4,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}

}  // namespace
static void DepthwiseConv2d(cl::Kernel *kernel, static void DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC const Tensor *input, // NHWC
const Tensor *filter, // HWIM const Tensor *filter, // HWIM
...@@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0}; const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel",
batch, height, width, channels, multiplier); gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
......
...@@ -116,11 +116,11 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -116,11 +116,11 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_ = input0->shape(); input_shape_ = input0->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::stringstream ss; std::string tuning_key =
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) Concat("eltwise_opencl_kernel", output->dim(0),
<< "_" << output->dim(2) << "_" << output->dim(3); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>(); char *kerror_code = kernel_error_->mutable_data<char>();
......
...@@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
std::stringstream ss; std::string tuning_key =
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" Concat("fc_opencl_kernel", output->dim(0),
<< output->dim(2) << "_" << output->dim(3); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future); TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
......
...@@ -206,6 +206,32 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -206,6 +206,32 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
} }
} }
// Default local work-group size for 2D kernels: lws[0] is bounded by a
// factor derived from the GPU's global memory cache size, and lws[1] takes
// the remaining work-group capacity. The trailing element stays 0 (the
// "untuned" marker used by the tuning machinery).
// NOTE(review): `gws` is currently unused here — kept for signature symmetry
// with Default3DLocalWS; consider clamping lws[1] by gws[1] in the future.
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(3, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to >= 1 so the division below can never be by zero.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[0] = std::min<uint32_t>(base, kwg_size);
  // BUG FIX: the original computed `kwg_size / lws[1]`, but lws[1] is still
  // zero at this point (the vector is zero-initialized), so every call
  // divided by zero. The intended divisor is lws[0].
  lws[1] = kwg_size / lws[0];
  return lws;
}
// Default local work-group size for 3D kernels: dimension 1 is filled first,
// dimension 2 is bounded by both gws[2] and the cache-derived base factor,
// and dimension 0 takes the remaining work-group capacity. The trailing
// element stays 0 (the "untuned" marker used by the tuning machinery).
// NOTE(review): assumes gws[1] >= 1 and gws[2] >= 1 — otherwise lws_size is
// zero and the last division is undefined; confirm with callers.
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to >= 1 so a small/unreported cache size cannot zero the divisors.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[1] * lws[2];
  lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
  return lws;
}
void TuningOrRun3DKernel(const cl::Kernel &kernel, void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
...@@ -216,31 +242,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -216,31 +242,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<uint32_t> local_ws(3, 0); std::vector<std::vector<uint32_t>> results;
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size); std::vector<std::vector<uint32_t>> candidates = {
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return {
// TODO(heliangliang): tuning these magic numbers // TODO(heliangliang): tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 0}, {gws[0], gws[1], gws[2], 0},
{kwg_size / 16, 4, 4, 0}, {gws[0], gws[1], gws[2] / 8, 0},
{kwg_size / 32, 4, 8, 0}, {gws[0], gws[1], gws[2] / 4, 0},
{kwg_size / 32, 8, 4, 0}, {gws[0], gws[1], 8, 0},
{kwg_size / 64, 8, 8, 0}, {gws[0], gws[1], 4, 0},
{kwg_size / 64, 16, 4, 0}, {gws[0], gws[1], 1, 0},
{kwg_size / 128, 8, 16, 0}, {gws[0] / 4, gws[1], gws[2], 0},
{kwg_size / 128, 16, 8, 0}, {gws[0] / 4, gws[1], gws[2] / 8, 0},
{kwg_size / 128, 32, 4, 0}, {gws[0] / 4, gws[1], gws[2] / 4, 0},
{1, kwg_size / 32, 32, 0}, {gws[0] / 4, gws[1], 8, 0},
{1, kwg_size / 64, 64, 0}, {gws[0] / 4, gws[1], 4, 0},
{1, kwg_size / 128, 128, 0}, {gws[0] / 4, gws[1], 1, 0},
{4, kwg_size / 16, 4, 0}, {gws[0] / 8, gws[1], gws[2], 0},
{4, kwg_size / 28, 7, 0}, {gws[0] / 8, gws[1], gws[2] / 8, 0},
{4, kwg_size / 32, 8, 0}, {gws[0] / 8, gws[1], gws[2] / 4, 0},
{4, kwg_size / 56, 14, 0}, {gws[0] / 8, gws[1], 8, 0},
{1, kwg_size, 1, 0}, {gws[0] / 8, gws[1], 4, 0},
{gws[0] / 8, gws[1], 1, 0},
{4, gws[1], gws[2], 0},
{4, gws[1], gws[2] / 8, 0},
{4, gws[1], gws[2] / 4, 0},
{4, gws[1], 8, 0},
{4, gws[1], 4, 0},
{4, gws[1], 1, 0},
{1, gws[1], gws[2], 0},
{1, gws[1], gws[2] / 8, 0},
{1, gws[1], gws[2] / 4, 0},
{1, gws[1], 8, 0},
{1, gws[1], 4, 0},
{1, gws[1], 1, 0},
}; };
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
results.push_back(ele);
}
}
return results;
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, Timer *timer, auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
...@@ -333,19 +375,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -333,19 +375,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
uint32_t local_ws[2]; std::vector<std::vector<uint32_t>> results;
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size); std::vector<std::vector<uint32_t>> candidates = {
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]); {kwg_size / 2, 2, 0},
return {{local_ws[0], local_ws[1], 0}, {kwg_size / 4, 4, 0},
{local_ws[1], local_ws[0], 0}, {kwg_size / 8, 8, 0},
{kwg_size / 4, 4, 0}, {kwg_size / 16, 16, 0},
{kwg_size / 16, 16, 0}, {kwg_size / 32, 32, 0},
{kwg_size / 32, 32, 0}, {kwg_size / 64, 64, 0},
{kwg_size / 64, 64, 0}, {kwg_size / 128, 128, 0},
{kwg_size / 128, 128, 0}, {kwg_size / 256, 256, 0},
{kwg_size / 256, 256, 0}, {kwg_size, 1, 0},
{kwg_size, 1, 0}, {1, kwg_size, 0}
{1, kwg_size, 0}}; };
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
results.push_back(ele);
}
}
return results;
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, Timer *timer, auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
...@@ -426,5 +475,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -426,5 +475,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
} }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -29,6 +29,8 @@ namespace kernels { ...@@ -29,6 +29,8 @@ namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds const float kMaxKernelExeTime = 1000.0; // microseconds
// Baseline GPU global-memory cache size (bytes). The device's reported
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE is divided by this value to derive a
// scaling factor ("base") for default local work-group sizes.
const int32_t kBaseGPUMemCacheSize = 16384;
enum BufferType { enum BufferType {
CONV2D_FILTER = 0, CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1, IN_OUT_CHANNEL = 1,
...@@ -112,6 +114,11 @@ std::string Concat(Args... args) { ...@@ -112,6 +114,11 @@ std::string Concat(Args... args) {
return ss.str(); return ss.str();
} }
// Heuristic default local work-group sizes for 2-D / 3-D OpenCL kernels.
// gws points to the kernel's global work sizes; kwg_size is the kernel's
// maximum work-group size as reported by the runtime.
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size);
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_ #endif // MACE_KERNELS_OPENCL_HELPER_H_
...@@ -85,10 +85,10 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -85,10 +85,10 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2)))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::stringstream ss; std::string tuning_key =
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" Concat("matmul_opencl_kernel", C->dim(0),
<< C->dim(2) << "_" << C->dim(3); C->dim(1), C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()( ...@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("pad", output->dim(0), output->dim(1), output->dim(2), Concat("pad", output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
......
...@@ -21,6 +21,28 @@ ...@@ -21,6 +21,28 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
// Default local work-group size for the pooling kernel.
// gws: global work sizes (x, y, z); kwg_size: kernel max work-group size.
// Returns {lws0, lws1, lws2, 0}; the trailing 0 terminates the tuning entry.
// Assumes gws[0..2] >= 1 (zero-sized global dims would divide by zero).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp base to >= 1: a device reporting a cache smaller than
  // kBaseGPUMemCacheSize would otherwise yield base == 0, forcing
  // lws[2] == 0 and a division by zero at kwg_size / lws_size below.
  uint32_t base = std::max<uint32_t>(
      static_cast<uint32_t>(cache_size / kBaseGPUMemCacheSize), 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  // Cap the z-dimension by both the cache-derived base and the budget
  // left after choosing lws[1].
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[1] * lws[2];
  lws[0] = gws[0] / 4;
  if (lws[0] == 0) {
    lws[0] = gws[0];
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws_size);
  return lws;
}
} // namespace
template <typename T> template <typename T>
void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -134,11 +156,11 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -134,11 +156,11 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}; };
} }
std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::stringstream ss; std::string tuning_key =
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) Concat("pooling_opencl_kernel_", output->dim(0),
<< "_" << output->dim(2) << "_" << output->dim(3); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -22,6 +22,34 @@ ...@@ -22,6 +22,34 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
// Default local work-group size for the resize-bilinear kernel.
// gws: global work sizes (x, y, z); kwg_size: kernel max work-group size.
// Returns {lws0, lws1, lws2, 0}; the trailing 0 terminates the tuning entry.
// Assumes gws[0..2] >= 1 (zero-sized global dims would divide by zero).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp base to >= 1: with base == 0 (cache < kBaseGPUMemCacheSize)
  // the lws[1] >= base branch would set lws[0] = min(gws[0], 0) = 0,
  // making lws_size == 0 and dividing by zero at kwg_size / lws_size.
  uint32_t base = std::max<uint32_t>(
      static_cast<uint32_t>(cache_size / kBaseGPUMemCacheSize), 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    lws[0] = gws[0] / 8;
    if (lws[0] == 0) {
      lws[0] = gws[0];
    }
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = gws[2] / 8;
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
template <typename T> template <typename T>
void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) { const Tensor *input, Tensor *output, StatsFuture *future) {
...@@ -99,11 +127,11 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -99,11 +127,11 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::stringstream ss; std::string tuning_key =
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" Concat("resize_bilinear_opencl_kernel", output->dim(0),
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -72,7 +72,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()( ...@@ -72,7 +72,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event; cl::Event event;
CallStats call_stats{INT64_MAX, 0}; CallStats call_stats{INT64_MAX, 0};
for (int i = 0; i < outputs_count; ++i) { for (int i = 0; i < outputs_count; ++i) {
......
...@@ -22,6 +22,27 @@ ...@@ -22,6 +22,27 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
// Default local work-group size for the softmax kernel.
// gws: global work sizes (x, y, z); kwg_size: kernel max work-group size.
// Returns {lws0, lws1, lws2, 0}; the trailing 0 terminates the tuning entry.
// Assumes gws[0..2] >= 1 (zero-sized global dims would divide by zero).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp base to >= 1: a device whose reported cache is smaller than
  // kBaseGPUMemCacheSize would make base == 0, and the else branch
  // below would then compute gws[0] / base — a division by zero.
  uint32_t base = std::max<uint32_t>(
      static_cast<uint32_t>(cache_size / kBaseGPUMemCacheSize), 1);
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (gws[0] < base) {
    lws[0] = gws[0];
  } else {
    lws[0] = gws[0] / base;
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  lws[2] = std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1]));
  return lws;
}
} // namespace
template <typename T> template <typename T>
void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
...@@ -81,11 +102,11 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -81,11 +102,11 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_ = logits->shape(); input_shape_ = logits->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::stringstream ss; std::string tuning_key =
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) Concat("softmax_opencl_kernel", output->dim(0),
<< "_" << output->dim(2) << "_" << output->dim(3); output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -105,12 +105,11 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -105,12 +105,11 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_ = space_tensor->shape(); space_shape_ = space_tensor->shape();
} }
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0}; const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::stringstream ss; std::string tuning_key =
ss << kernel_name << "_" << batch_tensor->dim(0) << "_" Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" batch_tensor->dim(2), batch_tensor->dim(3));
<< batch_tensor->dim(3); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
...@@ -102,11 +102,11 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -102,11 +102,11 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::stringstream ss; std::string tuning_key =
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" Concat("winograd_transform_kernel", output_tensor->dim(0),
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" output_tensor->dim(1), output_tensor->dim(2),
<< input_tensor->dim(3); output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
...@@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key =
std::stringstream ss; Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" output_tensor->dim(1), output_tensor->dim(2),
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" output_tensor->dim(3), input_tensor->dim(2));
<< input_tensor->dim(3); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册