Commit c046c1aa authored by Yuan Shuai, committed by GitHub

[LITE][OPENCL] Add opencl image2d conv3x3. test=develop (#2853)

* [LITE][OPENCL] Add opencl image2d conv3x3. test=develop
Parent 024c08f9
......@@ -362,6 +362,20 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
} else if (kernel_h == 3 && kernel_w == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
......@@ -582,6 +596,184 @@ void ConvImageCompute::Conv2d1x1() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d3x3() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
int filter_channel = filter_dims[1];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
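// offset centers the sliding window: filter_h / 2 - pad. For the common
// 3x3 / pad=1 case this is 3 / 2 - 1 = 0, i.e. the padding exactly
// compensates the half-filter overhang.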
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
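// The input image packs 4 channels per pixel: image width = input_w *
// ceil(C / 4), so dividing the image width by the tensor width recovers
// the number of channel blocks.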
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
// re-calc group
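// Three cases: a full convolution (filter OC == output C, filter IC ==
// input C) runs with groups = 1; a depthwise filter (filter OC == input C,
// filter IC == 1) keeps param.groups; anything else derives the group
// count from the channel ratio input_channel / filter_channel.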
int new_groups{param.groups};
if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
new_groups = 1;
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Unsupported conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" << output_dims
<< " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
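// Arguments must be set in the exact positional order declared by the
// conv2d_3x3 kernel in image/conv2d_3x3_kernel.cl; arg_idx is bumped
// once per argument below.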
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
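// One work-item per output image pixel: the 3D range maps to
// {output channel blocks, output width, batch * output height}, matching
// the DefaultWorkSize(output_dims, image shape) computed above.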
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......@@ -726,6 +918,7 @@ void ConvImageCompute::Conv2d5x5() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......
......@@ -71,6 +71,7 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
......
......@@ -446,6 +446,371 @@ TEST(conv2d, compute_image2d_1x1) {
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_3x3) {
// conv infos
const int ksize = 3;
// int loop_cnt = 0;
#ifdef LOOP_TEST
const int pad = 1;
const int dilation = 1;
const int stride = 2;
const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
#else
const int pad = 1;
const int dilation = 1;
#if 0 // small scale with group, but the cpu reference result is wrong
const int stride = 2;
const int group = 2;
const int batch_size = 1;
const int ic = 1;
const int ih = 3;
const int iw = 3;
const int oc = 2;
#else // big scale with group
const int stride = 1;
const int group = 32;
const int batch_size = 1;
const int ic = 32;
const int ih = 112;
const int iw = 112;
const int oc = 32;
#endif
const bool bias_flag = false;
const std::string relu_flag = "relu";
#endif
int filter_channel = ic;
if (group > 1) {
filter_channel = 1;
}
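// Standard conv output size:
// o = (i + 2 * pad - (dilation * (ksize - 1) + 1)) / stride + 1.
// E.g. ih = 112, ksize = 3, pad = 1, dilation = 1, stride = 1 gives oh = 112.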
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only support batch_size == 1";
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
param.groups = group;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_3x3_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(conv_3x3_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_3x3_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim = lite::DDim{
std::vector<int64_t>({oc, filter_channel, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
LOG(INFO) << "input_dim:" << input_dim
<< " filter_dim:" << filter_dim
<< " out_dim:" << out_dim;
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
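// Expected image2d extents: input/output images pack 4 channels per pixel
// (width = w * ceil(c / 4), height = h * batch); the folder-converted
// filter image is ksize * ceil(filter_channel / 4) wide and oc * ksize high.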
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((filter_channel + 3) / 4);
size_t filter_image_height = oc * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * filter_channel * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (int i = 0; i < input_v.size(); ++i) {
input_v[i] = i; // gen(engine);
}
for (int i = 0; i < filter_v.size(); ++i) {
filter_v[i] = 1; // gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "input_image_height:" << input_image_height
<< " input_image_width:" << input_image_width;
SHADOW_LOG << "filter_image_height:" << filter_image_height
<< " filter_image_width:" << filter_image_width;
SHADOW_LOG << "4 * input_image_height *input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// filter kernel
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
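// Copy the image2d result back to a host-side RGBA buffer (DtoH); the
// ImageToNCHW call below then unpacks it into the plain NCHW output_v
// for comparison against the CPU reference.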
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
#ifdef PRINT_RESULT
for (int i = 0; i < out_dim.production(); i++) {
VLOG(4) << "output_v[" << i << "]:" << output_v[i]
<< " out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_5x5) {
......