Unverified commit d341fccb, authored by ysh329, committed by GitHub

[OPENCL] Remove redundant code in conv OpenCL kernels. test=develop (#3924)

Remove redundant code in conv OpenCL kernels.
Parent 4780849f
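Summary for readers skimming the hunks below: besides renaming LocalWorkSizeTurn/LocalWorkSizeTurnReverse to LocalWorkSizeTune/LocalWorkSizeTuneReverse, the patch drops the #if defined(BIASE_CH) || defined(BIASE_ELE) guard around the bias argument of every conv kernel, and the host side (ConvImageCompute::PrepareForRun) now always binds a bias image, creating a 1x1 placeholder when no bias is fused. The following is a minimal, hypothetical OpenCL sketch of that convention; add_bias_demo and its body are illustrative only, not a kernel from this patch, and it assumes the real kernels keep using BIASE_CH/BIASE_ELE inside their bodies to decide how the bias is applied:

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Illustrative only: shows the "bias argument is always declared" convention.
__kernel void add_bias_demo(__read_only image2d_t input,
                            __read_only image2d_t bias,  // always present, even without a fused bias
                            __write_only image2d_t output) {
  const sampler_t smp =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int2 pos = (int2)(get_global_id(0), get_global_id(1));
  half4 v = read_imageh(input, smp, pos);
#ifdef BIASE_ELE
  v += read_imageh(bias, smp, pos);                // element-wise bias
#elif defined(BIASE_CH)
  v += read_imageh(bias, smp, (int2)(pos.x, 0));   // per-channel bias (indexing simplified for this demo)
#endif
  // Without -DBIASE_CH / -DBIASE_ELE the bias image is the 1x1 placeholder and is simply ignored.
  write_imageh(output, pos, v);
}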
......@@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
}
}
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
......@@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
static_cast<size_t>(gws0)};
#endif
}
cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
......
......@@ -62,10 +62,10 @@ class CLContext {
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size,
size_t max_work_size,
int divisor = 2);
cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
size_t max_work_size,
int divisor = 2);
bool IsArmMali();
......
......@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt(
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple(
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
......
......@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
\ No newline at end of file
}
......@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......@@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
\ No newline at end of file
}
......@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
......
......@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3(
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
......@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_nh,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
......
......@@ -30,92 +30,81 @@ namespace kernels {
namespace opencl {
void ConvImageCompute::PrepareForRun() {
const auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
ReInitWhenNeeded();
auto filter_dims = conv_param_->filter->dims();
filter_tensor_n_ = filter_dims[0];
filter_tensor_c_ = filter_dims[1];
filter_tensor_h_ = filter_dims[2];
filter_tensor_w_ = filter_dims[3];
float* filter_cpu = param.filter->mutable_data<float>();
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const bool is_mali = context.cl_context()->IsArmMali();
filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
int bs = x_dims[0];
int c_in = x_dims[1];
int h_out = output_dims[2];
int w_out = output_dims[3];
int kernel_h = filter_dims[2]; // oihw
int kernel_w = filter_dims[3];
auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
int pad_h = paddings[0];
int pad_w = paddings[2];
int groups = param.groups;
bool relu_fused = param.fuse_relu;
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool zero_pad = (pad_h == 0) && (pad_w == 0);
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) &&
(paddings[2] == paddings[3]));
bool stride_equal = stride_h == stride_w;
bool dilation_equal = dilations[0] == dilations[1];
auto paddings = *conv_param_->paddings;
pad_up_ = paddings[0];
pad_down_ = paddings[1];
pad_left_ = paddings[2];
pad_right_ = paddings[3];
auto dilations = *conv_param_->dilations;
dilation_h_ = dilations[0];
dilation_w_ = dilations[1];
stride_h_ = conv_param_->strides[0];
stride_w_ = conv_param_->strides[1];
groups_ = conv_param_->groups;
relu_fused_ = conv_param_->fuse_relu;
has_bias_ = (conv_param_->bias) != nullptr;
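// offset_ = filter_h / 2 - pad_up; passed to the kernels as their `offset` argument.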
offset_ = filter_tensor_h_ / 2 - pad_up_;
bool pad_equal = ((pad_left_ == pad_up_) && (pad_up_ == pad_down_) &&
(pad_left_ == pad_right_));
bool stride_equal = stride_h_ == stride_w_;
bool dilation_equal = dilation_h_ == dilation_w_;
VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No");
VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
<< " stride_w:" << stride_w << " pad_h:" << pad_h
<< " pad_w:" << pad_w << " kernel_h:" << kernel_h
<< " kernel_h:" << kernel_h;
VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3];
VLOG(3) << "dialtion:" << dilations[0] << " " << dilations[1];
VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " "
<< output_dims[2] << " " << output_dims[3];
VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " "
<< filter_dims[2] << " " << filter_dims[3];
VLOG(3) << "Is relu fused? / " << (relu_fused_ ? "Yes" : "No");
VLOG(3) << "groups:" << groups_ << " stride_h_:" << stride_h_
<< " stride_w_:" << stride_w_ << " pad_left_:" << pad_left_
<< " pad_up_:" << pad_up_ << " filter_tensor_h_:" << filter_tensor_h_
<< " filter_tensor_h_:" << filter_tensor_h_;
VLOG(3) << "input_tensor_nchw:" << input_tensor_n_ << " " << input_tensor_c_
<< " " << input_tensor_h_ << " " << input_tensor_w_;
VLOG(3) << "dialtion:" << dilation_h_ << " " << dilation_w_;
VLOG(3) << "output_dims:" << output_tensor_n_ << " " << output_tensor_c_
<< " " << output_tensor_h_ << " " << output_tensor_w_;
VLOG(3) << "filter_dims:" << filter_tensor_n_ << " " << filter_tensor_c_
<< " " << filter_tensor_h_ << " " << filter_tensor_w_;
VLOG(3) << "pad_equal:" << pad_equal;
VLOG(3) << "stride_equal:" << stride_equal;
VLOG(3) << "dilation_equal:" << dilation_equal;
VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3];
VLOG(3) << "padding :" << pad_up_ << " " << pad_down_ << " " << pad_left_
<< " " << pad_right_;
CHECK(pad_equal && stride_equal && dilation_equal);
CHECK_GE(conv_param_->dilations->size(), 2);
CHECK(dilation_h_ == dilation_w_);
CHECK_GE(conv_param_->paddings->size(), 2);
CHECK(pad_left_ == pad_up_);
CHECK_GE(conv_param_->strides.size(), 2);
CHECK(stride_h_ == stride_w_);
if (!is_mali) {
use_turn_ = false;
use_tune_ = false;
}
// general gws..
auto out_image_shape = InitImageDimInfoWith(output_dims);
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
default_c_blk_ = default_work_size[0];
default_w_blk_ = default_work_size[1];
default_nh_blk_ = default_work_size[2];
c_blk_ = default_c_blk_;
w_blk_ = default_w_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
if (kernel_h == 1 && kernel_w == 1) {
// conv2d_1x1
// if (param.x->dims()[1] % 4 == 0) {
// kernel_func_names_.push_back("conv2d_1x1_simple");
// } else {
// kernel_func_names_.push_back("conv2d_1x1_opt");
// }
/*********************************************
* Upload filter, bias to opencl device
*********************************************/
float* filter_cpu = conv_param_->filter->mutable_data<float>();
filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
if (param.x->dims()[1] % 4 == 0) {
if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
if (input_tensor_c_ % 4 == 0) {
kernel_func_names_.push_back("conv2d_1x1_simple");
} else {
kernel_func_names_.push_back("conv2d_1x1_opt");
......@@ -124,89 +113,49 @@ void ConvImageCompute::PrepareForRun() {
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
// std::vector<half_t> filter_image_v(filter_image_dims[0] *
// filter_image_dims[1] * 4); // 4 :
// RGBA
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d1x1opt;
{
// calc 1x1 gws
w_blk_ = maptofactor(default_w_blk_, 4);
c_blk_ = default_c_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#define DEPTH_CONV_USE_SPL
#ifdef DEPTH_CONV_USE_SPL
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
kernel_h == 3 && kernel_w == 3 && groups > 1) {
} else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ &&
filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) {
// depth_conv2d_3x3s1, depth_conv2d_3x3
if (stride_h == 1 && dilations[0] == 1) {
if (stride_h_ == 1 && dilation_h_ == 1) {
kernel_func_names_.push_back("depth_conv2d_3x3s1");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
{
// depthwise spl gws s1
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
c_blk_ = c_block;
w_blk_ = w_blk;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else {
kernel_func_names_.push_back("depth_conv2d_3x3");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
{
// depthwise spl gws
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
c_blk_ = c_block;
w_blk_ = w;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
}
kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
#endif
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]
} else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_
#ifdef DEPTH_CONV_USE_SPL
&&
kernel_h != 3
filter_tensor_h_ != 3
#endif
#undef DEPTH_CONV_USE_SPL
) {
......@@ -216,75 +165,61 @@ void ConvImageCompute::PrepareForRun() {
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (kernel_w == 3 && kernel_h == 3) {
} else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#else
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3;
#endif
#undef CONV3x3OPT_FALL_BACK
} else if (kernel_h == 5 && kernel_w == 5) {
} else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
// conv2d_5x5
......@@ -293,55 +228,42 @@ void ConvImageCompute::PrepareForRun() {
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d5x5;
#else
// conv2d_5x5_opt
kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch"
: "conv2d_5x5_opt");
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_5x5_multi_batch"
: "conv2d_5x5_opt");
kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d5x5opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif
#undef CONV_5x5_OPT
} else if (kernel_h == 7 && kernel_w == 7) {
} else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) {
#define CONV_7x7_OPT
#ifndef CONV_7x7_OPT
// conv2d_7x7
......@@ -350,52 +272,39 @@ void ConvImageCompute::PrepareForRun() {
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d7x7;
#else
// conv2d_7x7
kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch"
: "conv2d_7x7_opt");
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_7x7_multi_batch"
: "conv2d_7x7_opt");
kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
tensor_hold_filter_image_->Resize(
{1, filter_image_dims[0], filter_image_dims[1], 4});
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_data);
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d7x7opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif
#undef CONV_7x7_OPT
} else {
......@@ -407,30 +316,30 @@ void ConvImageCompute::PrepareForRun() {
// build options
std::string build_options_single(" -DCL_DTYPE_half");
// relu options
VLOG(3) << "relu_fused:" << relu_fused
<< " param.activation_param.active_type:"
<< static_cast<int>(param.activation_param.active_type)
<< " param.activation_param.has_active:"
<< param.activation_param.has_active;
if (param.activation_param.has_active) {
if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused`
VLOG(3) << "relu_fused_:" << relu_fused_
<< " conv_param_->activation_param.active_type:"
<< static_cast<int>(conv_param_->activation_param.active_type)
<< " conv_param_->activation_param.has_active:"
<< conv_param_->activation_param.has_active;
if (conv_param_->activation_param.has_active) {
if (conv_param_->activation_param.active_type ==
lite_api::ActivationType::kRelu) { // Note: checking `relu_fused_`
// here would also work
build_options_single += " -DRELU";
} else if (param.activation_param.active_type ==
} else if (conv_param_->activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_single += " -DRELU6";
} else {
LOG(FATAL) << "Unsupported activation type:"
<< static_cast<int>(param.activation_param.active_type);
<< static_cast<int>(conv_param_->activation_param.active_type);
}
}
GetGlobalWorkSize();
// bias options
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
if (has_bias) {
has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims();
if (has_bias_) {
bias_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
build_options_single +=
is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
......@@ -438,21 +347,36 @@ void ConvImageCompute::PrepareForRun() {
// convert cpu buffer bias --> gpu image
CLImageConverterFolder bias_converter;
const DDim& bias_image_dims =
bias_converter.InitImageDimInfoWith(param.bias->dims());
bias_converter.InitImageDimInfoWith(conv_param_->bias->dims());
bias_image_h_ = bias_image_dims[1];
bias_image_w_ = bias_image_dims[0];
tensor_hold_bias_image_->Resize(
{1, bias_image_dims[0], bias_image_dims[1], 4});
half_t* bias_image_data = tensor_hold_bias_image_->mutable_data<half_t>();
float* bias_cpu_data = param.bias->mutable_data<float>();
float* bias_cpu_data = conv_param_->bias->mutable_data<float>();
bias_converter.NCHWToImage(
bias_cpu_data, bias_image_data, param.bias->dims());
bias_cpu_data, bias_image_data, conv_param_->bias->dims());
this->bias_gpu_image_->mutable_data<half_t, cl::Image2D>(
bias_image_dims[0], bias_image_dims[1], bias_image_data);
// convert cpu buffer bias --> gpu image --- end ----
} else {
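// No fused bias: the kernels now always take a bias image argument (the
// BIASE_CH/BIASE_ELE guards around the parameter were removed), so bind a
// 1x1 placeholder image here.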
bias_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
CLImageConverterFolder bias_converter;
tensor_hold_bias_image_->Resize({1, 1, 1, 4});
half_t* bias_image_data = tensor_hold_bias_image_->mutable_data<half_t>();
this->bias_gpu_image_->mutable_data<half_t, cl::Image2D>(
1, 1, bias_image_data);
}
// define image pointer for filter, bias
input_image_p_ = conv_param_->x->data<half_t, cl::Image2D>();
filter_image_p_ = filter_gpu_image_->data<half_t, cl::Image2D>();
bias_image_p_ = bias_gpu_image_->data<half_t, cl::Image2D>();
output_image_p_ = conv_param_->output->mutable_data<half_t, cl::Image2D>(
output_image_w_, output_image_h_);
build_options_.push_back(build_options_single);
for (size_t i = 0; i < kernel_func_names_.size(); i++) {
......@@ -478,55 +402,55 @@ void ConvImageCompute::PrepareForRun() {
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws_) {
double min_turn_time = DBL_MAX;
double min_tune_time = DBL_MAX;
cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
global_work_size_, max_work_group_size);
VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " "
<< best_local_work_size[1] << " " << best_local_work_size[2];
cl::NDRange last_local_work_size = cl::NDRange{
static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
if (use_turn_) {
if (use_tune_) {
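// Auto-tune the local work size: try divisors 1..14 with LocalWorkSizeTune
// (and the reverse variant below), time each candidate via Tune(10), and
// keep the fastest one.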
for (size_t i = 1; i < 15; i++) {
if (kernel_h == 1 && kernel_w == 1) {
if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
// TODO: use different logic for the 1x1 case
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
local_work_size_ = context.cl_context()->LocalWorkSizeTune(
global_work_size_, max_work_group_size, i);
} else {
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
local_work_size_ = context.cl_context()->LocalWorkSizeTune(
global_work_size_, max_work_group_size, i);
}
if (last_local_work_size[0] == local_work_size_[0] &&
last_local_work_size[1] == local_work_size_[1] &&
last_local_work_size[2] == local_work_size_[2]) {
// skiped turned lws
// skip already-tried lws
continue;
}
auto turn_time = this->Turn(10);
if (min_turn_time > turn_time) {
min_turn_time = turn_time;
auto tune_time = this->Tune(10);
if (min_tune_time > tune_time) {
min_tune_time = tune_time;
best_local_work_size = local_work_size_;
}
last_local_work_size = local_work_size_;
}
// reverse
for (size_t i = 1; i < 15; i++) {
if (kernel_h == 1 && kernel_w == 1) {
if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
// TODO: use different logic for the 1x1 case
local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse(
global_work_size_, max_work_group_size, i);
} else {
local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse(
global_work_size_, max_work_group_size, i);
}
if (last_local_work_size[0] == local_work_size_[0] &&
last_local_work_size[1] == local_work_size_[1] &&
last_local_work_size[2] == local_work_size_[2]) {
// skiped turned lws
// skip already-tried lws
continue;
}
auto turn_time = this->Turn(10);
if (min_turn_time > turn_time) {
min_turn_time = turn_time;
auto tune_time = this->Tune(10);
if (min_tune_time > tune_time) {
min_tune_time = tune_time;
best_local_work_size = local_work_size_;
}
last_local_work_size = local_work_size_;
......@@ -540,548 +464,316 @@ void ConvImageCompute::PrepareForRun() {
}
}
void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
void ConvImageCompute::ReInitWhenNeeded() {
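// Re-derive shape-dependent state (tensor/image dims, groups_, input/output
// image pointers, global work size) only on the first run or when the input
// dims change.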
conv_param_ = param_.get_mutable<param_t>();
auto x_dims = conv_param_->x->dims();
#ifdef LITE_WITH_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
#ifdef LITE_WITH_LOG
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
// VLOG(4) << "default work size{c_block, w, nh}: "
// << "{" << c_block << ", " << w << ", " << nh << ""
// << "}";
LOG(INFO) << "is_first_epoch_for_run_:" << is_first_epoch_for_run_
<< ", last_input_dims_:" << last_input_dims_
<< ", x_dims:" << x_dims;
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// handle bias use buffer for channel wise , use image for element wise
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, default_w_blk_);
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::Conv2d3x3(bool is_turn) {
auto kernel = kernel_;
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
int filter_channel = filter_dims[1];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
// re-calc group
int new_groups{param.groups};
if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
new_groups = 1;
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
if (is_first_epoch_for_run_ || last_input_dims_ != x_dims) {
is_first_epoch_for_run_ = false;
last_input_dims_ = x_dims;
input_tensor_n_ = x_dims[0];
input_tensor_c_ = x_dims[1];
input_tensor_h_ = x_dims[2];
input_tensor_w_ = x_dims[3];
auto x_image_shape = InitImageDimInfoWith(x_dims);
input_image_h_ = x_image_shape["height"];
input_image_w_ = x_image_shape["width"];
auto output_dims = conv_param_->output->dims();
output_tensor_n_ = output_dims[0];
output_tensor_c_ = output_dims[1];
output_tensor_h_ = output_dims[2];
output_tensor_w_ = output_dims[3];
auto output_image_shape = InitImageDimInfoWith(output_dims);
output_image_h_ = output_image_shape["height"];
output_image_w_ = output_image_shape["width"];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
CHECK_GE(conv_param_->x->dims().size(), 4);
CHECK_GE(conv_param_->output->dims().size(), 4);
if (kernel_func_names_.size() > 0 &&
kernel_func_names_[0] == "conv2d_3x3") {
groups_ = conv_param_->groups;
if (filter_tensor_n_ == output_tensor_c_ &&
filter_tensor_c_ == input_tensor_c_) {
groups_ = 1;
} else if (!(filter_tensor_n_ == input_tensor_c_ &&
filter_tensor_c_ == 1)) {
groups_ = input_tensor_c_ / filter_tensor_c_;
}
}
*/
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_c_block: " << input_c_block;
// VLOG(4) << "input_c: " << input_c;
// VLOG(4) << "input_image: " << input_image;
// VLOG(4) << "input_dims: " << input_dims;
// VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
// VLOG(4) << "output_dims: " << output_dims;
// VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
// << out_image_shape["height"];
// VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
// VLOG(4) << "has bias: " << has_bias;
// VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
// VLOG(4) << "strides: " << strides[0] << "," << strides[1];
// VLOG(4) << "offset: " << offset;
// VLOG(4) << "dilations.size : " << dilations.size();
// VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
// VLOG(4) << "param.groups(groups):" << param.groups;
// VLOG(4) << "new_groups:" << new_groups;
// VLOG(4) << "default work size{c_block, w, nh}: "
// << "{" << c_block << ", " << w << ", " << nh << ""
// << "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
// STL::stringstream kernel_key;
// kernel_key << kernel_func_names_[0] << build_options_[0];
// auto kernel = context.cl_context()->GetKernel(kernel_key.str());
// VLOG(4) << "kernel_key: " << kernel_key.str();
// VLOG(4) << "kernel ready ... " << kernel_key.str();
// VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
// define image pointer for input, output
input_image_p_ = conv_param_->x->data<half_t, cl::Image2D>();
output_image_p_ = conv_param_->output->mutable_data<half_t, cl::Image2D>(
output_image_w_, output_image_h_);
GetGlobalWorkSize();
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(input_dims[1]));
CL_CHECK_FATAL(status);
// auto global_work_size =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(default_work_size.data()[1]),
// static_cast<size_t>(default_work_size.data()[2])};
// VLOG(4) << "out_image: " << out_image;
// VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
// << global_work_size[1] << "," << global_work_size[2] << "}";
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
#ifdef LITE_WITH_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
#endif
void ConvImageCompute::GetGlobalWorkSize() {
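// Choose global_work_size_ = {c_blk_, w_blk_, nh_blk_} per kernel variant,
// starting from DefaultWorkSize over the output image shape.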
if (kernel_func_names_.size() <= 0) return;
// general input_c_block
input_c_block_ = static_cast<int>(input_image_w_ / input_tensor_w_);
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
// general gws
auto output_dims = conv_param_->output->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(output_image_w_),
static_cast<int64_t>(output_image_h_)}));
default_c_blk_ = default_work_size[0];
default_w_blk_ = default_work_size[1];
default_nh_blk_ = default_work_size[2];
c_blk_ = default_c_blk_;
w_blk_ = default_w_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifdef LITE_WITH_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
if (kernel_func_names_[0] == "conv2d_1x1_simple" ||
kernel_func_names_[0] == "conv2d_1x1_opt") {
w_blk_ = maptofactor(default_w_blk_, 4);
c_blk_ = default_c_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "depth_conv2d_3x3s1") {
// depthwise spl gws s1
int c_block = (output_tensor_c_ + 3) / 4;
int w = output_tensor_w_;
int nh = output_tensor_n_ * output_tensor_h_;
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
c_blk_ = c_block;
w_blk_ = w_blk;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "depth_conv2d_3x3") {
// depthwise spl gws
int c_block = (output_tensor_c_ + 3) / 4;
int w = output_tensor_w_;
int nh = output_tensor_n_ * output_tensor_h_;
c_blk_ = c_block;
w_blk_ = w;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<int>((input_tensor_c_ + 3) / 4);
} else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
kernel_func_names_[0] == "conv2d_3x3_opt") {
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" ||
kernel_func_names_[0] == "conv2d_5x5_opt") {
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" ||
kernel_func_names_[0] == "conv2d_7x7_opt") {
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
}
void ConvImageCompute::Conv2d1x1opt(bool enable_tune) {
#ifdef LITE_WITH_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
PrintConvInfo();
#endif
auto& context = ctx_->As<OpenCLContext>();
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, offset_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, input_c_block_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, input_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(15, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(16, default_w_blk_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::Conv2d5x5(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
void ConvImageCompute::Conv2d3x3(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
PrintConvInfo();
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, offset_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, input_c_block_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(15, output_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(16, filter_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(17, filter_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(18, filter_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(19, groups_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(20, input_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status_);
}
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
void ConvImageCompute::Conv2d3x3opt(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "set bias_image: ";
PrintConvInfo();
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto& context = ctx_->As<OpenCLContext>();
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, pad_left_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, input_tensor_n_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(15, output_tensor_h_);
CL_CHECK_FATAL(status_);
#ifdef LITE_WITH_LOG
// VLOG(4) << "out_image: " << out_image;
......@@ -1089,697 +781,406 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) {
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status_);
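  // When tuning, block until the command queue drains so the caller measures
  // completed kernel time rather than just the enqueue.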
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::Conv2d5x5opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
// default_work_size[2] = h_blk;
void ConvImageCompute::Conv2d5x5(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
PrintConvInfo();
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
// VLOG(4) << "out_image: " << out_image;
auto& context = ctx_->As<OpenCLContext>();
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
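  // conv2d_5x5 args: stride, padding offset, input channel blocks, dilation,
  // then the input/output spatial sizes.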
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, offset_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, input_c_block_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::Conv2d7x7(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
void ConvImageCompute::Conv2d5x5opt(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
PrintConvInfo();
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
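  // conv2d_5x5_opt takes pad and batch like the 3x3 opt kernel and is
  // enqueued with the tuned local work size.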
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, pad_left_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, input_tensor_n_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(15, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
void ConvImageCompute::Conv2d7x7(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "set bias_image: ";
PrintConvInfo();
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
auto& context = ctx_->As<OpenCLContext>();
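  // conv2d_7x7 uses the same argument layout as conv2d_5x5.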
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, offset_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, input_c_block_);
CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(10, dilation_h_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(11, input_tensor_w_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(12, input_tensor_h_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(13, output_tensor_w_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(14, output_tensor_h_);
  CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
}
void ConvImageCompute::Conv2d7x7opt(bool enable_tune) {
#ifdef LITE_WITH_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
PrintConvInfo();
#endif
auto& context = ctx_->As<OpenCLContext>();
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
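  // conv2d_7x7_opt mirrors the 5x5 opt argument layout (pad, batch) and also
  // launches with the tuned local work size.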
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, pad_left_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, input_tensor_n_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(15, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::Conv2d7x7opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
void ConvImageCompute::DepthwiseConv2d3x3s1(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "============ conv2d 7x7 params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
PrintConvInfo();
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, pad_left_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, input_tensor_c_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_->data<half_t, cl::Image2D>();
const cl::Image2D* bias_img = nullptr;
if (param.bias) {
bias_img = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto image_shape = InitImageDimInfoWith(output_dims);
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_img);
CL_CHECK_FATAL(status);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
void ConvImageCompute::DepthwiseConv2d3x3(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "set bias_image: ";
PrintConvInfo();
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *output_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
auto& context = ctx_->As<OpenCLContext>();
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, offset_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, input_c_block_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
int offset = filter_dims[2] / 2 - paddings[0];
int input_c_block = (x_dims[1] + 3) / 4;
auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_->data<half_t, cl::Image2D>();
const cl::Image2D* bias_img = nullptr;
if (param.bias) {
bias_img = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto image_shape = InitImageDimInfoWith(output_dims);
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
auto kernel = kernel_;
void ConvImageCompute::DepthwiseConv2d(bool enable_tune) {
#ifdef LITE_WITH_LOG
VLOG(4) << "setArg";
VLOG(4) << "strides = " << strides[0];
VLOG(4) << "offset = " << offset;
VLOG(4) << "dilations = " << dilations[0];
VLOG(4) << "input_c_block = " << input_c_block;
VLOG(4) << "x_dims[3] = " << x_dims[3];
VLOG(4) << "x_dims[2] = " << x_dims[2];
VLOG(4) << "output_dims[3] = " << output_dims[3];
VLOG(4) << "output_dims[2] = " << output_dims[2];
PrintConvInfo();
#endif
auto& context = ctx_->As<OpenCLContext>();
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_img);
CL_CHECK_FATAL(status);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
#ifdef LITE_WITH_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *output_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(offset));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(input_c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
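  // The generic depthwise kernel additionally receives the filter width and
  // height (args 15-16).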
status_ = kernel_.setArg(0, c_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(1, w_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(2, nh_blk_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(3, *input_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(4, *filter_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(5, *bias_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(6, *output_image_p_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(7, stride_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(8, offset_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(9, input_c_block_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(10, dilation_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(11, input_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(12, input_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(13, output_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(14, output_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(15, filter_tensor_w_);
CL_CHECK_FATAL(status_);
status_ = kernel_.setArg(16, filter_tensor_h_);
CL_CHECK_FATAL(status_);
status_ = EnqueueNDRangeKernel(context,
kernel_,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status_);
if (enable_tune) {
CLRuntime::Global()->command_queue().finish();
}
}
void ConvImageCompute::DepthwiseConv2d(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_->data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
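// Run() dispatches to the selected impl_ member function with tuning disabled.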
void ConvImageCompute::Run() { (this->*impl_)(false); }
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
void ConvImageCompute::PrintConvInfo() {
const bool is_element_wise_bias =
has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims();
#ifdef LITE_WITH_LOG
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "input_image_shape: " << input_image_w_ << "," << input_image_h_;
// VLOG(4) << "input_image: " << input_image_p_;
VLOG(4) << "input_dims: " << conv_param_->x->dims();
VLOG(4) << "filter_dims: " << conv_param_->filter->dims();
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "output_dims: " << conv_param_->output->dims();
VLOG(4) << "out_image_shape: " << output_image_w_ << ", " << output_image_h_;
VLOG(4) << "paddings: " << pad_left_ << "," << pad_up_;
VLOG(4) << "has bias: " << has_bias_;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
  // Handle bias: use a buffer for channel-wise bias and an image for
  // element-wise bias.
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_->data<half_t, cl::Image2D>();
}
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifdef LITE_WITH_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG
VLOG(4) << "strides: " << stride_h_ << "," << stride_w_;
VLOG(4) << "offset: ";
VLOG(4) << "dilations.size : " << conv_param_->dilations->size();
VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
void ConvImageCompute::Run() { (this->*impl_)(false); }
double ConvImageCompute::Turn(int times) {
double ConvImageCompute::Tune(int times) {
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
......
......@@ -33,6 +33,7 @@ namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
......@@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override;
void ReInitWhenNeeded() override;
void Run() override;
double Turn(int times = 5);
double Tune(int times = 5);
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
......@@ -56,16 +60,20 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
#endif
private:
void Conv2d1x1opt(bool is_turn = false);
void Conv2d3x3(bool is_turn = false);
void Conv2d3x3opt(bool is_turn = false);
void Conv2d5x5(bool is_turn = false);
void Conv2d5x5opt(bool is_turn = false);
void Conv2d7x7(bool is_turn = false);
void Conv2d7x7opt(bool is_turn = false);
void DepthwiseConv2d3x3s1(bool is_turn = false);
void DepthwiseConv2d3x3(bool is_turn = false);
void DepthwiseConv2d(bool is_turn = false);
void PrintConvInfo();
void GetGlobalWorkSize();
void Conv2d1x1opt(bool enable_tune = false);
void Conv2d3x3(bool enable_tune = false);
void Conv2d3x3opt(bool enable_tune = false);
void Conv2d5x5(bool enable_tune = false);
void Conv2d5x5opt(bool enable_tune = false);
void Conv2d7x7(bool enable_tune = false);
void Conv2d7x7opt(bool enable_tune = false);
void DepthwiseConv2d3x3s1(bool enable_tune = false);
void DepthwiseConv2d3x3(bool enable_tune = false);
void DepthwiseConv2d(bool enable_tune = false);
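  // Each impl binds its kernel arguments and enqueues the kernel;
  // enable_tune blocks on the command queue so the run can be timed.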
param_t* conv_param_{nullptr};
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
......@@ -79,19 +87,72 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
std::unique_ptr<Tensor> tensor_hold_bias_image_{nullptr};
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
// opencl kernel args
int c_blk_ = 1;
int w_blk_ = 1;
int nh_blk_ = 1;
const cl::Image2D* input_image_p_{nullptr};
const cl::Image2D* filter_image_p_{nullptr};
const cl::Image2D* bias_image_p_{nullptr};
const cl::Image2D* output_image_p_{nullptr};
int stride_h_{-1};
int stride_w_{-1};
int dilation_h_{-1};
int dilation_w_{-1};
int pad_up_{-1};
int pad_down_{-1};
int pad_left_{-1};
int pad_right_{-1};
int offset_{-1};
int groups_{-1};
bool relu_fused_{false};
bool has_bias_{false};
int input_tensor_n_{-1};
int input_tensor_c_{-1};
int input_tensor_h_{-1};
int input_tensor_w_{-1};
int input_image_h_{-1};
int input_image_w_{-1};
int input_c_block_{-1};
int output_tensor_n_{-1};
int output_tensor_c_{-1};
int output_tensor_h_{-1};
int output_tensor_w_{-1};
int output_image_h_{-1};
int output_image_w_{-1};
int filter_tensor_n_{-1};
int filter_tensor_c_{-1};
int filter_tensor_h_{-1};
int filter_tensor_w_{-1};
int filter_image_h_{-1};
int filter_image_w_{-1};
int bias_image_h_{-1};
int bias_image_w_{-1};
int default_c_blk_ = 1;
int default_w_blk_ = 1;
int default_nh_blk_ = 1;
// =================
DDim last_input_dims_{};
bool is_first_epoch_for_run_{true};
cl::Kernel kernel_;
cl_int status_;
cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws_{true};
bool use_turn_{false};
bool use_tune_{false};
};
} // namespace opencl
......