Commit 8f5e912e authored by xiebaiyuan, committed by GitHub

[LITE][OPENCL][Image] mv kernel init gws lws into prepare for run (#3285)

* [LITE][OPENCL][Image] mv kernel init gws lws into prepare for run, test=develop

* [LITE][OPENCL][Image] shut down profile, test=develop

* [LITE][OPENCL][Image] move log ahead, test=develop
Parent e3cf724e
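
The diff below follows a prepare-once / run-many pattern: the kernel handle, the global work size (`c_blk_`, `w_blk_`, `nh_blk_`, `global_work_size_`) and the local work size are computed once in `PrepareForRun()`, so each `Run()` call only sets kernel arguments and enqueues. The following is a minimal sketch of that split using simplified stand-in types rather than the real Paddle-Lite/OpenCL API; `ConvComputeSketch`, `NDRange3` and the console "enqueue" are illustrative placeholders, not names from the patch.

```cpp
// Sketch only: illustrates moving work-size setup out of the per-inference hot path.
// Types and method names are placeholders, not the real Lite/OpenCL API.
#include <cstddef>
#include <iostream>

struct NDRange3 { size_t x{1}, y{1}, z{1}; };

class ConvComputeSketch {
 public:
  // Called once per (shape, kernel) configuration.
  void PrepareForRun(int out_c, int out_w, int out_n, int out_h) {
    // Default gws: one work-item per 4-channel block, per output column,
    // per (batch * row) slice -- mirrors DefaultWorkSize in the patch.
    c_blk_ = (out_c + 3) / 4;
    w_blk_ = out_w;
    nh_blk_ = out_n * out_h;
    global_work_size_ = {static_cast<size_t>(c_blk_),
                         static_cast<size_t>(w_blk_),
                         static_cast<size_t>(nh_blk_)};
    // A real implementation would also build the cl::Kernel here and query
    // CL_KERNEL_WORK_GROUP_SIZE to choose local_work_size_.
  }

  // Called on every inference; no work-size recomputation.
  void Run() const {
    std::cout << "enqueue gws = {" << global_work_size_.x << ", "
              << global_work_size_.y << ", " << global_work_size_.z << "}\n";
  }

 private:
  int c_blk_{1}, w_blk_{1}, nh_blk_{1};
  NDRange3 global_work_size_;
};

int main() {
  ConvComputeSketch conv;
  conv.PrepareForRun(/*out_c=*/32, /*out_w=*/56, /*out_n=*/1, /*out_h=*/56);
  conv.Run();  // reuses the precomputed sizes
  conv.Run();
}
```
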
......@@ -14,8 +14,8 @@
#include "lite/kernels/opencl/conv_image_compute.h"
#include <iomanip>
#include <sstream>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
......@@ -78,9 +78,27 @@ void ConvImageCompute::PrepareForRun() {
VLOG(3) << "dilation_equal:" << dilation_equal;
VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3];
CHECK(pad_equal && stride_equal && dilation_equal);
// general gws..
auto out_image_shape = InitImageDimInfoWith(output_dims);
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
default_c_blk_ = default_work_size[0];
default_w_blk_ = default_work_size[1];
default_nh_blk_ = default_work_size[2];
c_blk_ = default_c_blk_;
w_blk_ = default_w_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
if (kernel_h == 1 && kernel_w == 1) {
// conv2d_1x1
if (param.x->dims()[1] % 4 == 0) {
......@@ -99,6 +117,15 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1opt;
{
// calc 1x1 gws
w_blk_ = maptofactor(default_w_blk_, 4);
c_blk_ = default_c_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#define DEPTH_CONV_USE_SPL
#ifdef DEPTH_CONV_USE_SPL
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
......@@ -107,9 +134,38 @@ void ConvImageCompute::PrepareForRun() {
if (stride_h == 1 && dilations[0] == 1) {
kernel_func_names_.push_back("depth_conv2d_3x3s1");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
{
// depthwise spl gws s1
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
c_blk_ = c_block;
w_blk_ = w_blk;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else {
kernel_func_names_.push_back("depth_conv2d_3x3");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
{
// depthwise spl gws
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
c_blk_ = c_block;
w_blk_ = w;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
}
kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
......@@ -157,6 +213,22 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else if (kernel_h == 5 && kernel_w == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -189,6 +261,21 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif
#undef CONV_5x5_OPT
} else if (kernel_h == 7 && kernel_w == 7) {
......@@ -223,6 +310,21 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif
#undef CONV_7x7_OPT
......@@ -270,9 +372,36 @@ void ConvImageCompute::PrepareForRun() {
context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
}
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
size_t max_work_group_size = 0;
kernel_.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1(
// global_work_size_, max_work_group_size);
local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_,
max_work_group_size);
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}";
}
}
void ConvImageCompute::Conv2d1x1opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -302,16 +431,28 @@ void ConvImageCompute::Conv2d1x1opt() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
// auto global_work_size_ =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -331,9 +472,9 @@ void ConvImageCompute::Conv2d1x1opt() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
// VLOG(4) << "default work size{c_block, w, nh}: "
// << "{" << c_block << ", " << w << ", " << nh << ""
// << "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -350,27 +491,14 @@ void ConvImageCompute::Conv2d1x1opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, maped_w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -401,49 +529,87 @@ void ConvImageCompute::Conv2d1x1opt() {
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, default_w_blk_);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
#ifdef PROFILE_CONV_KERNEL
bool use_profile = false;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
double start = GetCurrentUS();
if (use_profile) {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
} else {
int count = 50;
double sumtime = 0;
if (!use_profile) {
count = 1;
}
for (size_t i = 0; i < count; i++) {
start = GetCurrentUS();
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (use_profile) {
event_->wait();
double duration = GetCurrentUS() - start;
sumtime += duration;
}
}
auto dims_string = [](DDimLite dims) -> std::string {
std::ostringstream stream;
stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
<< dims[3] << "]";
return stream.str();
};
if (use_profile) {
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std::cout << std::setw(25) << std::left << dims_string(input_dims)
<< std::setw(25) << std::left << dims_string(filter_dims)
<< std::setw(25) << std::left << dims_string(output_dims)
<< std::setw(25) << std::left << sumtime / count << std::endl;
} else {
dims_string(input_dims);
}
}
#endif
}
void ConvImageCompute::Conv2d3x3() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -486,24 +652,14 @@ void ConvImageCompute::Conv2d3x3() {
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
......@@ -527,9 +683,9 @@ void ConvImageCompute::Conv2d3x3() {
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
// VLOG(4) << "default work size{c_block, w, nh}: "
// << "{" << c_block << ", " << w << ", " << nh << ""
// << "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -544,26 +700,15 @@ void ConvImageCompute::Conv2d3x3() {
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -607,21 +752,16 @@ void ConvImageCompute::Conv2d3x3() {
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -630,6 +770,8 @@ void ConvImageCompute::Conv2d3x3() {
}
void ConvImageCompute::Conv2d3x3opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -657,24 +799,6 @@ void ConvImageCompute::Conv2d3x3opt() {
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -692,9 +816,6 @@ void ConvImageCompute::Conv2d3x3opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -710,24 +831,15 @@ void ConvImageCompute::Conv2d3x3opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -763,38 +875,17 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
......@@ -802,6 +893,8 @@ void ConvImageCompute::Conv2d3x3opt() {
}
void ConvImageCompute::Conv2d5x5() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -833,16 +926,6 @@ void ConvImageCompute::Conv2d5x5() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -863,9 +946,6 @@ void ConvImageCompute::Conv2d5x5() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -881,25 +961,15 @@ void ConvImageCompute::Conv2d5x5() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -933,21 +1003,16 @@ void ConvImageCompute::Conv2d5x5() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -956,6 +1021,8 @@ void ConvImageCompute::Conv2d5x5() {
}
void ConvImageCompute::Conv2d5x5opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -984,22 +1051,6 @@ void ConvImageCompute::Conv2d5x5opt() {
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
......@@ -1018,9 +1069,6 @@ void ConvImageCompute::Conv2d5x5opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -1035,22 +1083,14 @@ void ConvImageCompute::Conv2d5x5opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1083,38 +1123,13 @@ void ConvImageCompute::Conv2d5x5opt() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
// VLOG(4) << "out_image: " << out_image;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
// VLOG(4) << "out_image: " << out_image;
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
......@@ -1122,6 +1137,8 @@ void ConvImageCompute::Conv2d5x5opt() {
}
void ConvImageCompute::Conv2d7x7() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -1153,16 +1170,6 @@ void ConvImageCompute::Conv2d7x7() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -1183,9 +1190,6 @@ void ConvImageCompute::Conv2d7x7() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -1201,25 +1205,15 @@ void ConvImageCompute::Conv2d7x7() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1253,21 +1247,16 @@ void ConvImageCompute::Conv2d7x7() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -1275,6 +1264,8 @@ void ConvImageCompute::Conv2d7x7() {
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -1302,23 +1293,6 @@ void ConvImageCompute::Conv2d7x7opt() {
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d 7x7 params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -1336,9 +1310,6 @@ void ConvImageCompute::Conv2d7x7opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -1353,24 +1324,15 @@ void ConvImageCompute::Conv2d7x7opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1403,39 +1365,19 @@ void ConvImageCompute::Conv2d7x7opt() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
......@@ -1444,8 +1386,6 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto strides = param.strides;
auto dilations = *param.dilations;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>();
......@@ -1459,26 +1399,15 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
auto global_work_size = cl::NDRange(c_block, w_blk, nh);
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
......@@ -1516,28 +1445,11 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
......@@ -1545,6 +1457,8 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
}
void ConvImageCompute::DepthwiseConv2d3x3() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
......@@ -1555,8 +1469,6 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
int offset = filter_dims[2] / 2 - paddings[0];
int input_c_block = (x_dims[1] + 3) / 4;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>();
......@@ -1570,21 +1482,10 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
auto kernel = kernel_;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
VLOG(4) << "nh = " << nh;
VLOG(4) << "strides = " << strides[0];
VLOG(4) << "offset = " << offset;
VLOG(4) << "dilations = " << dilations[0];
......@@ -1597,11 +1498,11 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w));
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
......@@ -1641,7 +1542,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -1650,6 +1551,8 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
}
void ConvImageCompute::DepthwiseConv2d() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -1681,16 +1584,6 @@ void ConvImageCompute::DepthwiseConv2d() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -1710,9 +1603,6 @@ void ConvImageCompute::DepthwiseConv2d() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -1730,25 +1620,15 @@ void ConvImageCompute::DepthwiseConv2d() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1786,21 +1666,16 @@ void ConvImageCompute::DepthwiseConv2d() {
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -1809,7 +1684,7 @@ void ConvImageCompute::DepthwiseConv2d() {
}
void ConvImageCompute::Run() { (this->*impl_)(); }
#undef PROFILE_CONV_KERNEL
} // namespace opencl
} // namespace kernels
} // namespace lite
......
......@@ -59,6 +59,19 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
int c_blk_ = 1;
int w_blk_ = 1;
int nh_blk_ = 1;
int default_c_blk_ = 1;
int default_w_blk_ = 1;
int default_nh_blk_ = 1;
cl::Kernel kernel_;
cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws{true};
};
......