提交 eb1e5131 编写于 作者: L Liangliang He

Merge branch 'opencl-lws' into 'master'

Refactor opencl default local work group size.

See merge request !449
......@@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime():
}
}
device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
&device_gloabl_mem_cache_size_);
device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS,
&device_compute_units_);
const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
&& out_of_range_check[0] == '1') {
......@@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
// Returns the device's global memory cache size in bytes, as queried via
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE in the OpenCLRuntime constructor.
// Used by the kernels to derive default local work-group sizes.
// NOTE(review): the member name contains a typo ("gloabl"); renaming it
// would also require changing the header declaration — confirm before fixing.
const uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
  return device_gloabl_mem_cache_size_;
}
// Returns the number of parallel compute units on the device, as queried
// via CL_DEVICE_MAX_COMPUTE_UNITS in the OpenCLRuntime constructor.
const uint32_t OpenCLRuntime::device_compute_units() const {
  return device_compute_units_;
}
bool OpenCLRuntime::BuildProgramFromBinary(
const std::string &built_program_key,
const std::string &build_options_str,
......
......@@ -73,6 +73,8 @@ class OpenCLRuntime {
cl::CommandQueue &command_queue();
const GPUType gpu_type() const;
const std::string platform_info() const;
const uint64_t device_global_mem_cache_size() const;
const uint32_t device_compute_units() const;
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
......@@ -128,6 +130,9 @@ class OpenCLRuntime {
bool program_map_changed_;
std::unique_ptr<KVStorage> storage_;
bool is_profiling_enabled_;
uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
......
OpenCL Image Storage Layout
===
Use **Image** object to optimize memory access and parallel computing based on OpenCL 2.0.
Design the corresponding **Image** format to optimize memory access for different Op algorithm.
Each pixel of **Image** object contains 4 elements(e.g. RGBA).
The following are the **Buffer** and **Image** formats for all **Tensors**.
Input/Output
---
**Mace** uses the NHWC format for Input/Output.
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Channel-Major Input/Output | NHWC | [W * (C+3)/4, N * H] | Default Input/Output format|
|Height-Major Input/Output | NHWC | [W * C, N * (H+3)/4] | Winograd Convolution format|
|Width-Major Input/Output | NHWC | [(W+3)/4 * C, N * H] | Winograd Convolution format|
Each Pixel of **Image** contains 4 elements. The table below lists the coordinate relation
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation
| --------- | :---------:| :-----: |
|Channel-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j/H, h=j%H, w=i%W, c=[i/W * 4 + k])}| k=[0, 4)|
|Height-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j%N, h=[j/H*4 + k], w=i%W, c=i/W)}| k=[0, 4)|
|Width-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j/H, h=j%H, w=[i%W*4 + k], c=i/W)}| k=[0, 4)|
Filter
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Convolution Filter | HWOI | [H * W * RoundUp<4>(I), (O+3)/4]|Convolution filter format. There is no difference compared to [H * W * I, (O+3)/4]|
|Depthwise Convolution Filter | HWIM | [H * W * M, (I+3)/4]|Depthwise-Convolution filter format|
Each Pixel of **Image** contains 4 elements. The table below lists the coordinate relation
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|Convolution Filter | P[m, n] = {E[h, w, o, i] &#124; (h=T/W, w=T%W, o=[n*4+k], i=m%RI)}| RI=((I + 3) / 4) * 4, T=m/RI, k=[0, 4)|
|Depthwise Convolution Filter | P[m, n] = {E[h, w, i, 0] &#124; (h=m/W, w=m%W, i=[n*4+k])}| only support multiplier == 1, k=[0, 4)|
1-D Argument
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|1-D Argument | W | [(W+3)/4, 1] | 1D argument format, e.g. Bias|
Each Pixel of **Image** contains 4 elements. The table below lists the coordinate relation
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|1-D Argument | P[i, 0] = {E[w] &#124; w=i*4+k}| k=[0, 4)|
......@@ -21,7 +21,6 @@
namespace mace {
namespace kernels {
template <typename T>
void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *alpha,
......@@ -56,23 +55,23 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel_";
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel_";
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel_";
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel_";
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel_";
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
......@@ -110,7 +109,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
......
......@@ -106,10 +106,10 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::stringstream ss;
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -116,9 +116,12 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size_);
lws[0] = std::min<uint32_t>(4, kwg_size_ / lws[1]);
lws[2] = std::min<uint32_t>(gws[2], kwg_size_ / (lws[1] * lws[0]));
std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
......
......@@ -79,7 +79,7 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event;
cl_int error;
......
......@@ -90,14 +90,11 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "channel_shuffle_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -21,6 +21,23 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic default local work-group size for the concat kernels.
// Fixes lws[1] to the full second global dimension (capped by the kernel's
// max work-group size), then splits the remaining work-group budget across
// the other two axes, each capped by a cache-derived "base" factor.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
  lws[2] = std::min<uint32_t>(base, kwg_size / (lws[0] * lws[1]));
  return lws;
}
} // namespace
static void Concat2(cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
......@@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel,
*prev_input_shape = input0->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::stringstream ss;
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......@@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel,
index_t chan_blk_offset = 0;
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4;
......@@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel,
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -183,6 +200,7 @@ static void ConcatN(cl::Kernel *kernel,
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange,
......
......@@ -20,6 +20,43 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128;
// Heuristic local work-group size for the 1x1 convolution kernel.
// Balances the cache-derived "base" factor, the number of compute units,
// and the per-work-item cache footprint (kernel_cache_size) against the
// device's max work-group size (kwg_size).
// gws: 3-element global work size; kwg_size: max work-group size for the
// kernel. Returns a 4-element lws (last element 0 terminates tuning params).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
  // base scales with how much larger the cache is than the reference size.
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
    // Wide first dimension: still cap lws[0] by base.
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    // Fall back to a fraction of gws[0], but not below base.
    lws[0] = gws[0] / 8;
    if (lws[0] < base) {
      lws[0] = std::max<uint32_t>(gws[0] / 4, base);
    }
  }
  // Never exceed the work-group budget remaining after lws[1].
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Size the third dimension so one work-group's data roughly fits the
  // cache share of a compute unit (the *8 factor is empirical).
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
......@@ -130,9 +167,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
Concat("conv2d_1x1_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
......@@ -21,6 +21,34 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
// Heuristic local work-group size for the 3x3 convolution kernel.
// lws[1] takes the full second global dimension (capped by kwg_size);
// lws[0] is capped by both a small cache-derived "base" and the remaining
// work-group budget; lws[2] is sized from the per-unit cache share,
// rounded up to a multiple of base, then clamped to gws[2] and the budget.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Assume roughly half the compute units are usable, but never zero.
  const uint32_t units = std::max<uint32_t>(
      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
  const uint32_t base =
      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  const uint32_t dim0_budget = kwg_size / lws[1];
  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base), dim0_budget);
  const uint32_t plane = lws[0] * lws[1];
  uint32_t depth = std::min<uint32_t>(
      RoundUp<uint32_t>(cache_size / kernel_cache_size / plane / units, base),
      gws[2]);
  if (depth == 0) {
    depth = std::min<uint32_t>(gws[2], base);
  }
  lws[2] = std::min<uint32_t>(depth, kwg_size / plane);
  return lws;
}
} // namespace
extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
......@@ -128,9 +156,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 0};
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
Concat("conv2d_3x3_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
......@@ -21,6 +21,42 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20;
// Heuristic local work-group size for the general convolution kernel.
// Unlike the 1x1/3x3 variants this also scales the cache footprint by the
// filter's spatial size (kernel_size = filter_h * filter_w).
// gws: 3-element global work size; kwg_size: max work-group size.
// Returns a 4-element lws (last element 0 terminates tuning params).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kernel_size,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
  // base scales with how much larger the cache is than the reference size.
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  // First dimension: a quarter of the global size, falling back to the
  // whole dimension when it is smaller than 4.
  lws[0] = gws[0] / 4;
  if (lws[0] == 0) {
    lws[0] = gws[0];
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Third dimension sized from the per-unit cache share divided by the
  // filter footprint (the *8 factor is empirical).
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / kernel_size / lws_size / compute_units)
      * 8,
      gws[2]);
  if (lws[2] == 0) {
    // Small depth: take it whole; otherwise fall back to base.
    if (gws[2] < lws_limit) {
      lws[2] = gws[2];
    } else {
      lws[2] = base;
    }
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
......@@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("conv2d_general_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3),
filter->dim(0), filter->dim(1));
std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(0) * filter->dim(1), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const char *kernel_name = nullptr;
uint32_t gws[3];
std::stringstream ss;
std::string tuning_key;
index_t output_height, output_width, output_depth;
if (d2s_) {
output_height = input_height * block_size_;
......@@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
ss << "depth_to_space_opencl_kernel_" << batch << "_"
<< output_height << "_" << output_width << "_" << output_depth;
tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height,
output_width, output_depth);
} else {
output_height = input_height / block_size_;
output_width = input_width / block_size_;
......@@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
}
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
......@@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -21,6 +21,37 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
// Heuristic local work-group size for the depthwise convolution kernel.
// min_lws0 is the cache-derived lower bound for the first dimension; the
// third dimension is sized from the cache footprint of one work-item
// (kernel_cache_size) with an empirical *4 factor.
// gws: 3-element global work size; kwg_size: max work-group size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= min_lws0) {
    lws[0] = std::min<uint32_t>(gws[0], min_lws0);
  } else {
    // Try an eighth of gws[0]; if still below the cache-derived bound,
    // widen to a quarter (but never past the remaining budget).
    lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
    if (lws[0] < min_lws0) {
      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, min_lws0),
                                  kwg_size / lws[1]);
    }
  }
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size) * 4,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
static void DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
......@@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier);
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel",
gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -116,11 +116,11 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
std::stringstream ss;
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -206,6 +206,31 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
// Default local work-group size for 2D kernels: {lws0, lws1, 0}.
// lws0 is the cache-derived "base" factor capped by the kernel's max
// work-group size; lws1 takes the remaining work-group budget.
// gws is currently unused — kept for signature parity with
// Default3DLocalWS and possible future use.
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(3, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to >= 1 so the division below can never be by zero, even on
  // devices reporting a global memory cache smaller than the reference.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size), 1);
  // Bug fix: previously divided by lws[1], which is still 0 here
  // (zero-initialized above) — a guaranteed division by zero. The second
  // dimension must be derived from the already-chosen lws[0].
  lws[1] = kwg_size / lws[0];
  return lws;
}
// Default local work-group size for 3D kernels.
// lws[1] takes the full second global dimension (capped by kwg_size);
// lws[2] is capped by both the cache-derived "base" and the remaining
// budget; lws[0] takes whatever work-group budget is left, capped by base.
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  const uint32_t dim2_budget = kwg_size / lws[1];
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), dim2_budget);
  lws[0] = std::min<uint32_t>(base, kwg_size / (lws[1] * lws[2]));
  return lws;
}
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
......@@ -216,31 +241,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return {
std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = {
// TODO(heliangliang): tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 0},
{kwg_size / 16, 4, 4, 0},
{kwg_size / 32, 4, 8, 0},
{kwg_size / 32, 8, 4, 0},
{kwg_size / 64, 8, 8, 0},
{kwg_size / 64, 16, 4, 0},
{kwg_size / 128, 8, 16, 0},
{kwg_size / 128, 16, 8, 0},
{kwg_size / 128, 32, 4, 0},
{1, kwg_size / 32, 32, 0},
{1, kwg_size / 64, 64, 0},
{1, kwg_size / 128, 128, 0},
{4, kwg_size / 16, 4, 0},
{4, kwg_size / 28, 7, 0},
{4, kwg_size / 32, 8, 0},
{4, kwg_size / 56, 14, 0},
{1, kwg_size, 1, 0},
{gws[0], gws[1], gws[2], 0},
{gws[0], gws[1], gws[2] / 8, 0},
{gws[0], gws[1], gws[2] / 4, 0},
{gws[0], gws[1], 8, 0},
{gws[0], gws[1], 4, 0},
{gws[0], gws[1], 1, 0},
{gws[0] / 4, gws[1], gws[2], 0},
{gws[0] / 4, gws[1], gws[2] / 8, 0},
{gws[0] / 4, gws[1], gws[2] / 4, 0},
{gws[0] / 4, gws[1], 8, 0},
{gws[0] / 4, gws[1], 4, 0},
{gws[0] / 4, gws[1], 1, 0},
{gws[0] / 8, gws[1], gws[2], 0},
{gws[0] / 8, gws[1], gws[2] / 8, 0},
{gws[0] / 8, gws[1], gws[2] / 4, 0},
{gws[0] / 8, gws[1], 8, 0},
{gws[0] / 8, gws[1], 4, 0},
{gws[0] / 8, gws[1], 1, 0},
{4, gws[1], gws[2], 0},
{4, gws[1], gws[2] / 8, 0},
{4, gws[1], gws[2] / 4, 0},
{4, gws[1], 8, 0},
{4, gws[1], 4, 0},
{4, gws[1], 1, 0},
{1, gws[1], gws[2], 0},
{1, gws[1], gws[2] / 8, 0},
{1, gws[1], gws[2] / 4, 0},
{1, gws[1], 8, 0},
{1, gws[1], 4, 0},
{1, gws[1], 1, 0},
};
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
results.push_back(ele);
}
}
return results;
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
......@@ -333,19 +374,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1], 0},
{local_ws[1], local_ws[0], 0},
{kwg_size / 4, 4, 0},
{kwg_size / 16, 16, 0},
{kwg_size / 32, 32, 0},
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}};
std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = {
{kwg_size / 2, 2, 0},
{kwg_size / 4, 4, 0},
{kwg_size / 8, 8, 0},
{kwg_size / 16, 16, 0},
{kwg_size / 32, 32, 0},
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}
};
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
results.push_back(ele);
}
}
return results;
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
......@@ -426,5 +474,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
}
} // namespace kernels
} // namespace mace
......@@ -29,6 +29,8 @@ namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds
const int32_t kBaseGPUMemCacheSize = 16384;
enum BufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
......@@ -112,6 +114,10 @@ std::string Concat(Args... args) {
return ss.str();
}
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
const uint32_t kwg_size);
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
......@@ -85,10 +85,10 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::stringstream ss;
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("matmul_opencl_kernel", C->dim(0),
C->dim(1), C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("pad", output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
......
......@@ -21,6 +21,28 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic local work-group size for the pooling kernel.
// Chooses lws[1] and lws[2] first (capped by the cache-derived "base" and
// the work-group budget), then gives lws[0] a quarter of gws[0] — or the
// whole dimension when it is smaller than 4 — within the leftover budget.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  uint32_t dim0 = gws[0] / 4;
  if (dim0 == 0) {
    dim0 = gws[0];
  }
  lws[0] = std::min<uint32_t>(dim0, kwg_size / (lws[1] * lws[2]));
  return lws;
}
} // namespace
template <typename T>
void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output,
......@@ -134,11 +156,11 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
};
}
std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -22,6 +22,34 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic local work-group size for the resize-bilinear kernel.
// gws: 3-element global work size; kwg_size: max work-group size.
// Returns a 4-element lws (last element 0 terminates tuning params).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // base scales with how much larger the cache is than the reference size.
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    // Narrow second dimension: take an eighth of gws[0], or the whole
    // dimension when it is smaller than 8.
    lws[0] = gws[0] / 8;
    if (lws[0] == 0) {
      lws[0] = gws[0];
    }
  }
  // Never exceed the work-group budget remaining after lws[1].
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Third dimension: an eighth of gws[2], whole dimension when < 8,
  // clamped to the remaining work-group budget.
  lws[2] = gws[2] / 8;
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
template <typename T>
void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
......@@ -99,11 +127,11 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -72,7 +72,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (int i = 0; i < outputs_count; ++i) {
......
......@@ -22,6 +22,27 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic local work-group size for the softmax kernel.
// For small first dimensions (< base) the whole dimension is used;
// otherwise it is split into "base" chunks. Each dimension is then
// clamped so the product never exceeds the kernel's max work-group size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  const uint32_t dim0 = (gws[0] < base) ? gws[0] : (gws[0] / base);
  lws[0] = std::min<uint32_t>(dim0, kwg_size / lws[1]);
  lws[2] = std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1]));
  return lws;
}
} // namespace
template <typename T>
void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output,
......@@ -81,11 +102,11 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_ = logits->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -105,12 +105,11 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -102,11 +102,11 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::stringstream ss;
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("winograd_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......@@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::stringstream ss;
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册