Commit 8f5e912e authored by xiebaiyuan, committed by GitHub

[LITE][OPENCL][Image] mv kernel init gws lws into prepare for run (#3285)

* [LITE][OPENCL][Image] mv kernel init gws lws into prepare for run, test=develop

* [LITE][OPENCL][Image] shut down profile, test=develop

* [LITE][OPENCL][Image] move log ahead, test=develop
Parent e3cf724e
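Note: the sketch below reproduces, as standalone C++, the block-size arithmetic this commit consolidates into PrepareForRun(). It assumes maptofactor() is plain ceil-division and that DefaultWorkSize() yields {(C + 3) / 4, W, N * H} for an NCHW output; the dilation/stride/group checks of the real code are omitted, so treat it as an illustration rather than the exact selection logic.

```cpp
// Standalone sketch of the global-work-size arithmetic this patch moves into
// ConvImageCompute::PrepareForRun(). Assumptions (for illustration only):
// maptofactor() == ceil-division, DefaultWorkSize() == {(C+3)/4, W, N*H}.
#include <array>
#include <cstdio>

static int CeilDiv(int v, int factor) { return (v + factor - 1) / factor; }

// Returns {c_blk_, w_blk_, nh_blk_} for an NCHW output, mirroring the
// per-kernel overrides in PrepareForRun() when the *opt kernels are selected
// (dilation/stride/group checks omitted).
std::array<int, 3> GlobalWorkSize(int n, int c, int h, int w, int kernel_h,
                                  int kernel_w, bool depthwise, int stride_h) {
  const int default_c_blk = CeilDiv(c, 4);  // 4 output channels per work-item
  const int default_w_blk = w;
  const int default_nh_blk = n * h;

  if (kernel_h == 1 && kernel_w == 1) {
    // conv2d_1x1 opt path: maptofactor(default_w_blk_, 4)
    return {default_c_blk, CeilDiv(default_w_blk, 4), default_nh_blk};
  }
  if (depthwise && kernel_h == 3 && kernel_w == 3) {
    // depth_conv2d_3x3s1 packs 2 output columns per work-item;
    // the generic depth_conv2d_3x3 keeps the default sizes.
    int w_blk = (stride_h == 1) ? CeilDiv(default_w_blk, 2) : default_w_blk;
    return {default_c_blk, w_blk, default_nh_blk};
  }
  if ((kernel_h == 3 && kernel_w == 3) || (kernel_h == 5 && kernel_w == 5) ||
      (kernel_h == 7 && kernel_w == 7)) {
    // conv2d_{3x3,5x5,7x7}opt: w_blk_size = 5, h_blk_size = 1
    return {default_c_blk, CeilDiv(default_w_blk, 5),
            CeilDiv(default_nh_blk, 1)};
  }
  return {default_c_blk, default_w_blk, default_nh_blk};  // generic kernels
}

int main() {
  // Example: a 1x32x112x112 output of a (non-depthwise) 3x3 conv.
  auto gws = GlobalWorkSize(1, 32, 112, 112, 3, 3, false, 1);
  std::printf("gws = {%d, %d, %d}\n", gws[0], gws[1], gws[2]);  // {8, 23, 112}
  return 0;
}
```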
...@@ -14,8 +14,8 @@
#include "lite/kernels/opencl/conv_image_compute.h" #include "lite/kernels/opencl/conv_image_compute.h"
#include <iomanip>
#include <sstream> #include <sstream>
#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
...@@ -78,9 +78,27 @@ void ConvImageCompute::PrepareForRun() {
VLOG(3) << "dilation_equal:" << dilation_equal; VLOG(3) << "dilation_equal:" << dilation_equal;
VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3]; << paddings[2] << " " << paddings[3];
CHECK(pad_equal && stride_equal && dilation_equal); CHECK(pad_equal && stride_equal && dilation_equal);
// general gws..
auto out_image_shape = InitImageDimInfoWith(output_dims);
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
default_c_blk_ = default_work_size[0];
default_w_blk_ = default_work_size[1];
default_nh_blk_ = default_work_size[2];
c_blk_ = default_c_blk_;
w_blk_ = default_w_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
if (kernel_h == 1 && kernel_w == 1) { if (kernel_h == 1 && kernel_w == 1) {
// conv2d_1x1 // conv2d_1x1
if (param.x->dims()[1] % 4 == 0) { if (param.x->dims()[1] % 4 == 0) {
...@@ -99,6 +117,15 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1opt; impl_ = &ConvImageCompute::Conv2d1x1opt;
{
// calc 1x1 gws
w_blk_ = maptofactor(default_w_blk_, 4);
c_blk_ = default_c_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#define DEPTH_CONV_USE_SPL #define DEPTH_CONV_USE_SPL
#ifdef DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
...@@ -107,9 +134,38 @@ void ConvImageCompute::PrepareForRun() {
if (stride_h == 1 && dilations[0] == 1) { if (stride_h == 1 && dilations[0] == 1) {
kernel_func_names_.push_back("depth_conv2d_3x3s1"); kernel_func_names_.push_back("depth_conv2d_3x3s1");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
{
// depthwise spl gws s1
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
c_blk_ = c_block;
w_blk_ = w_blk;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else { } else {
kernel_func_names_.push_back("depth_conv2d_3x3"); kernel_func_names_.push_back("depth_conv2d_3x3");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3; impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
{
// depthwise spl gws
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
c_blk_ = c_block;
w_blk_ = w;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} }
kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
...@@ -157,6 +213,22 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3opt; impl_ = &ConvImageCompute::Conv2d3x3opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else if (kernel_h == 5 && kernel_w == 5) { } else if (kernel_h == 5 && kernel_w == 5) {
#define CONV_5x5_OPT #define CONV_5x5_OPT
#ifndef CONV_5x5_OPT #ifndef CONV_5x5_OPT
...@@ -189,6 +261,21 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5opt; impl_ = &ConvImageCompute::Conv2d5x5opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif #endif
#undef CONV_5x5_OPT #undef CONV_5x5_OPT
} else if (kernel_h == 7 && kernel_w == 7) { } else if (kernel_h == 7 && kernel_w == 7) {
...@@ -223,6 +310,21 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7opt; impl_ = &ConvImageCompute::Conv2d7x7opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif #endif
#undef CONV_7x7_OPT #undef CONV_7x7_OPT
...@@ -270,9 +372,36 @@ void ConvImageCompute::PrepareForRun() {
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
} }
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
size_t max_work_group_size = 0;
kernel_.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1(
// global_work_size_, max_work_group_size);
local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_,
max_work_group_size);
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}";
}
} }
void ConvImageCompute::Conv2d1x1opt() { void ConvImageCompute::Conv2d1x1opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -302,16 +431,28 @@ void ConvImageCompute::Conv2d1x1opt() {
int input_c = input_dims[1]; int input_c = input_dims[1];
auto dilations = *param.dilations; auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size = // const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims, // DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{ // DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]), // static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])})); // static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
int c_block = default_work_size[0]; // auto global_work_size_ =
int w = default_work_size[1]; // cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
int nh = default_work_size[2]; // static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d_1x1 params ============"; VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
...@@ -331,9 +472,9 @@ void ConvImageCompute::Conv2d1x1opt() {
VLOG(4) << "offset: " << offset; VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: " // VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << "" // << "{" << c_block << ", " << w << ", " << nh << ""
<< "}"; // << "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]); CHECK(dilations[0] == dilations[1]);
...@@ -350,27 +491,14 @@ void ConvImageCompute::Conv2d1x1opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, maped_w); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -401,49 +529,87 @@ void ConvImageCompute::Conv2d1x1opt() {
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height); status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w); status = kernel.setArg(++arg_idx, default_w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
local_work_size, local_work_size_,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
#ifdef PROFILE_CONV_KERNEL
bool use_profile = false;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
double start = GetCurrentUS();
if (use_profile) {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
} else {
int count = 50;
double sumtime = 0;
if (!use_profile) {
count = 1;
}
for (size_t i = 0; i < count; i++) {
start = GetCurrentUS();
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (use_profile) {
event_->wait();
double duration = GetCurrentUS() - start;
sumtime += duration;
}
}
auto dims_string = [](DDimLite dims) -> std::string {
std::ostringstream stream;
stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
<< dims[3] << "]";
return stream.str();
};
if (use_profile) {
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std::cout << std::setw(25) << std::left << dims_string(input_dims)
<< std::setw(25) << std::left << dims_string(filter_dims)
<< std::setw(25) << std::left << dims_string(output_dims)
<< std::setw(25) << std::left << sumtime / count << std::endl;
} else {
dims_string(input_dims);
}
}
#endif
} }
void ConvImageCompute::Conv2d3x3() { void ConvImageCompute::Conv2d3x3() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -486,24 +652,14 @@ void ConvImageCompute::Conv2d3x3() {
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel; new_groups = input_channel / filter_channel;
} }
/* TODO(ysh329): mobile has no case below /* TODO(ysh329): mobile has no case below
else { else {
LOG(FATAL) << "Not support conv3x3 case with" LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" << << " input_dims:" << input_dims << " output_dims:" <<
output_dims output_dims
<< " filter_dims:" << filter_dims; << " filter_dims:" << filter_dims;
} }
*/ */
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============"; VLOG(4) << "============ conv2d params ============";
...@@ -527,9 +683,9 @@ void ConvImageCompute::Conv2d3x3() {
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups; VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups; VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: " // VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << "" // << "{" << c_block << ", " << w << ", " << nh << ""
<< "}"; // << "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
...@@ -544,26 +700,15 @@ void ConvImageCompute::Conv2d3x3() {
if (has_bias) { if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto kernel = kernel_;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -607,21 +752,16 @@ void ConvImageCompute::Conv2d3x3() {
status = kernel.setArg(++arg_idx, new_groups); status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif #endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -630,6 +770,8 @@ void ConvImageCompute::Conv2d3x3() {
} }
void ConvImageCompute::Conv2d3x3opt() { void ConvImageCompute::Conv2d3x3opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -657,24 +799,6 @@ void ConvImageCompute::Conv2d3x3opt() {
const bool is_element_wise_bias = const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims(); has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============"; VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
...@@ -692,9 +816,6 @@ void ConvImageCompute::Conv2d3x3opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1]; VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
...@@ -710,24 +831,15 @@ void ConvImageCompute::Conv2d3x3opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -763,38 +875,17 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, output_height); status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif #endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
local_work_size, local_work_size_,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -802,6 +893,8 @@ void ConvImageCompute::Conv2d3x3opt() {
} }
void ConvImageCompute::Conv2d5x5() { void ConvImageCompute::Conv2d5x5() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -833,16 +926,6 @@ void ConvImageCompute::Conv2d5x5() {
int input_c = input_dims[1]; int input_c = input_dims[1];
auto dilations = *param.dilations; auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============"; VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
...@@ -863,9 +946,6 @@ void ConvImageCompute::Conv2d5x5() {
VLOG(4) << "offset: " << offset; VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
...@@ -881,25 +961,15 @@ void ConvImageCompute::Conv2d5x5() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -933,21 +1003,16 @@ void ConvImageCompute::Conv2d5x5() {
status = kernel.setArg(++arg_idx, output_height); status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif #endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -956,6 +1021,8 @@ void ConvImageCompute::Conv2d5x5() {
} }
void ConvImageCompute::Conv2d5x5opt() { void ConvImageCompute::Conv2d5x5opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -984,22 +1051,6 @@ void ConvImageCompute::Conv2d5x5opt() {
const bool is_element_wise_bias = const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims(); has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk; // default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============"; VLOG(4) << "============ conv2d params ============";
...@@ -1018,9 +1069,6 @@ void ConvImageCompute::Conv2d5x5opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1]; VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]); CHECK(dilations[0] == dilations[1]);
...@@ -1035,22 +1083,14 @@ void ConvImageCompute::Conv2d5x5opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1083,38 +1123,13 @@ void ConvImageCompute::Conv2d5x5opt() {
status = kernel.setArg(++arg_idx, output_height); status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size = // VLOG(4) << "out_image: " << out_image;
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
// VLOG(4) << "out_image: " << out_image;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
local_work_size, local_work_size_,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1122,6 +1137,8 @@ void ConvImageCompute::Conv2d5x5opt() {
} }
void ConvImageCompute::Conv2d7x7() { void ConvImageCompute::Conv2d7x7() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -1153,16 +1170,6 @@ void ConvImageCompute::Conv2d7x7() {
int input_c = input_dims[1]; int input_c = input_dims[1];
auto dilations = *param.dilations; auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============"; VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
...@@ -1183,9 +1190,6 @@ void ConvImageCompute::Conv2d7x7() {
VLOG(4) << "offset: " << offset; VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
...@@ -1201,25 +1205,15 @@ void ConvImageCompute::Conv2d7x7() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1253,21 +1247,16 @@ void ConvImageCompute::Conv2d7x7() {
status = kernel.setArg(++arg_idx, output_height); status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif #endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -1275,6 +1264,8 @@ void ConvImageCompute::Conv2d7x7() {
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
} }
void ConvImageCompute::Conv2d7x7opt() { void ConvImageCompute::Conv2d7x7opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -1302,23 +1293,6 @@ void ConvImageCompute::Conv2d7x7opt() {
const bool is_element_wise_bias = const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims(); has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d 7x7 params ============"; VLOG(4) << "============ conv2d 7x7 params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
...@@ -1336,9 +1310,6 @@ void ConvImageCompute::Conv2d7x7opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1]; VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]); CHECK(dilations[0] == dilations[1]);
...@@ -1353,24 +1324,15 @@ void ConvImageCompute::Conv2d7x7opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1403,39 +1365,19 @@ void ConvImageCompute::Conv2d7x7opt() {
status = kernel.setArg(++arg_idx, output_height); status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
local_work_size, local_work_size_,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
} }
void ConvImageCompute::DepthwiseConv2d3x3s1() { void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims(); auto filter_dims = param.filter->dims();
...@@ -1444,8 +1386,6 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto strides = param.strides; auto strides = param.strides;
auto dilations = *param.dilations; auto dilations = *param.dilations;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<half_t, cl::Image2D>(); auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>(); auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>();
...@@ -1459,26 +1399,15 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>( auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key; auto kernel = kernel_;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
auto global_work_size = cl::NDRange(c_block, w_blk, nh);
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block)); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk)); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh)); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img); status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1516,28 +1445,11 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2])); status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
local_work_size, local_work_size_,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1545,6 +1457,8 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
} }
void ConvImageCompute::DepthwiseConv2d3x3() { void ConvImageCompute::DepthwiseConv2d3x3() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims(); auto filter_dims = param.filter->dims();
...@@ -1555,8 +1469,6 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
int offset = filter_dims[2] / 2 - paddings[0]; int offset = filter_dims[2] / 2 - paddings[0];
int input_c_block = (x_dims[1] + 3) / 4; int input_c_block = (x_dims[1] + 3) / 4;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<half_t, cl::Image2D>(); auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>(); auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>();
...@@ -1570,21 +1482,10 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>( auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key; auto kernel = kernel_;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "setArg"; VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
VLOG(4) << "nh = " << nh;
VLOG(4) << "strides = " << strides[0]; VLOG(4) << "strides = " << strides[0];
VLOG(4) << "offset = " << offset; VLOG(4) << "offset = " << offset;
VLOG(4) << "dilations = " << dilations[0]; VLOG(4) << "dilations = " << dilations[0];
...@@ -1597,11 +1498,11 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block)); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w)); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh)); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img); status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1641,7 +1542,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -1650,6 +1551,8 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
} }
void ConvImageCompute::DepthwiseConv2d() { void ConvImageCompute::DepthwiseConv2d() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims(); auto input_dims = param.x->dims();
auto paddings = *param.paddings; auto paddings = *param.paddings;
...@@ -1681,16 +1584,6 @@ void ConvImageCompute::DepthwiseConv2d() {
int input_c = input_dims[1]; int input_c = input_dims[1];
auto dilations = *param.dilations; auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ depthwise conv2d params ============"; VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
...@@ -1710,9 +1603,6 @@ void ConvImageCompute::DepthwiseConv2d() {
VLOG(4) << "offset: " << offset; VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size(); VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif #endif
CHECK_GE(dilations.size(), 2); CHECK_GE(dilations.size(), 2);
...@@ -1730,25 +1620,15 @@ void ConvImageCompute::DepthwiseConv2d() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>(); bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
} }
auto& context = ctx_->As<OpenCLContext>(); auto kernel = kernel_;
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status; cl_int status;
int arg_idx = 0; int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block); status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w); status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh); status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image); status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1786,21 +1666,16 @@ void ConvImageCompute::DepthwiseConv2d() {
status = kernel.setArg(++arg_idx, filter_height); status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif #endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -1809,7 +1684,7 @@ void ConvImageCompute::DepthwiseConv2d() {
} }
void ConvImageCompute::Run() { (this->*impl_)(); } void ConvImageCompute::Run() { (this->*impl_)(); }
#undef PROFILE_CONV_KERNEL
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
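For readers skimming the flattened diff above: the net effect on the .cc file is that kernel selection, the cached cl::Kernel, and the global/local work sizes are all set up once in PrepareForRun(), while each Conv2dNxN()/DepthwiseConv2d*() body now only sets arguments and enqueues with the cached values. A much-simplified illustration (placeholder kernel names and sizes, no OpenCL dependency) follows; it is a sketch of the structure, not the real class.

```cpp
// Simplified illustration of the restructuring in this commit. The real class
// is in lite/kernels/opencl/conv_image_compute.{h,cc}.
#include <array>
#include <cstdio>
#include <string>

class ConvImageComputeSketch {
 public:
  void PrepareForRun(int kernel_h, int kernel_w) {
    // 1. Choose the kernel function and the Run() implementation once.
    if (kernel_h == 1 && kernel_w == 1) {
      kernel_func_name_ = "conv2d_1x1";  // placeholder name
      impl_ = &ConvImageComputeSketch::Conv2d1x1opt;
    } else {
      kernel_func_name_ = "conv2d_3x3";  // placeholder name
      impl_ = &ConvImageComputeSketch::Conv2d3x3opt;
    }
    // 2. Compute global/local work sizes once (see the block-size sketch near
    //    the top of this page); placeholder values here.
    global_work_size_ = {8, 28, 112};
  }

  // 3. Run() just dispatches through the cached member-function pointer.
  void Run() { (this->*impl_)(); }

 private:
  void Conv2d1x1opt() { Enqueue(); }  // real code: setArg(...) + enqueue only
  void Conv2d3x3opt() { Enqueue(); }
  void Enqueue() {
    std::printf("enqueue %s with gws {%zu, %zu, %zu}\n",
                kernel_func_name_.c_str(), global_work_size_[0],
                global_work_size_[1], global_work_size_[2]);
  }

  void (ConvImageComputeSketch::*impl_)() = nullptr;
  std::string kernel_func_name_;
  std::array<size_t, 3> global_work_size_{{1, 1, 1}};
};

int main() {
  ConvImageComputeSketch conv;
  conv.PrepareForRun(/*kernel_h=*/1, /*kernel_w=*/1);
  conv.Run();  // prints: enqueue conv2d_1x1 with gws {8, 28, 112}
  return 0;
}
```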
...@@ -59,6 +59,19 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_; Tensor filter_gpu_image_;
Tensor bias_gpu_image_; Tensor bias_gpu_image_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
int c_blk_ = 1;
int w_blk_ = 1;
int nh_blk_ = 1;
int default_c_blk_ = 1;
int default_w_blk_ = 1;
int default_nh_blk_ = 1;
cl::Kernel kernel_;
cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws{true}; bool use_lws{true};
}; };
......
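The new members above cache the kernel, the block counts, and the work sizes between runs. For illustration only, the snippet below shows one generic way a 3-D local work size could be fitted under the CL_KERNEL_WORK_GROUP_SIZE limit queried in PrepareForRun(); this is an assumed heuristic, not the actual CLContext::LocalWorkSize() implementation, and a real implementation must additionally keep each local dimension a divisor of the corresponding global dimension on OpenCL 1.x devices.

```cpp
// Assumed, generic heuristic (not Paddle-Lite's CLContext::LocalWorkSize()):
// shrink the largest axis until the work-group fits the device limit.
#include <array>
#include <cstddef>
#include <cstdio>

std::array<size_t, 3> PickLocalWorkSize(std::array<size_t, 3> lws,
                                        size_t max_work_group_size) {
  if (max_work_group_size == 0) return {{1, 1, 1}};  // caller also checks > 0
  while (lws[0] * lws[1] * lws[2] > max_work_group_size) {
    size_t* largest = &lws[0];
    if (lws[1] > *largest) largest = &lws[1];
    if (lws[2] > *largest) largest = &lws[2];
    *largest = (*largest + 1) / 2;  // halve (rounding up) the biggest axis
  }
  return lws;
}

int main() {
  // Example: gws {8, 28, 112} on a device with a 256 work-item group limit.
  auto lws = PickLocalWorkSize({{8, 28, 112}}, 256);
  std::printf("lws = {%zu, %zu, %zu}\n", lws[0], lws[1], lws[2]);  // {4, 7, 7}
  return 0;
}
```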