Commit 9f343ac2 authored by 开心的小妮

fix opencl concat. test=develop

Parent d51324bf
......@@ -11,6 +11,285 @@ limitations under the License. */
#include <cl_common.h>
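// The concatByC* kernels below assume the image2d tensor layout used
// throughout these kernels: each CL_DTYPE4 pixel packs 4 channels, with
// x = (c / 4) * out_W + w and y = n * H + h, so concatenating along C only
// changes which input image a given channel is read from.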
__kernel void concatByCWith2Inputs(
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_W,
__read_only image2d_t input_image_0,
__private const int C_0,
__read_only image2d_t input_image_1,
__private const int C_1) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output_data;
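// Assemble the 4 packed channels of this output pixel one by one: a global
// channel index c below C_0 comes from input_image_0, otherwise from
// input_image_1 at offset c - C_0.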
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i;
if (c >= out_C) {
break;
}
int c_in;
CL_DTYPE4 input_data;
if (c < C_0) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
} else {
c_in = c - C_0;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
}
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
__kernel void concatByCWith3Inputs(
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_W,
__read_only image2d_t input_image_0,
__private const int C_0,
__read_only image2d_t input_image_1,
__private const int C_1,
__read_only image2d_t input_image_2,
__private const int C_2) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i;
if (c >= out_C) {
break;
}
int c_in;
CL_DTYPE4 input_data;
if (c < C_0) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
} else if (c < C_0 + C_1) {
c_in = c - C_0;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
} else {
c_in = c - C_0 - C_1;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
}
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
__kernel void concatByCWith4Inputs(
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_W,
__read_only image2d_t input_image_0,
__private const int C_0,
__read_only image2d_t input_image_1,
__private const int C_1,
__read_only image2d_t input_image_2,
__private const int C_2,
__read_only image2d_t input_image_3,
__private const int C_3) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i;
if (c >= out_C) {
break;
}
int c_in;
CL_DTYPE4 input_data;
if (c < C_0) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
} else if (c < C_0 + C_1) {
c_in = c - C_0;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
} else if (c < C_0 + C_1 + C_2) {
c_in = c - C_0 - C_1;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
} else if (c < C_0 + C_1 + C_2 + C_3) {
c_in = c - C_0 - C_1 - C_2;
int2 input_pos;
input_pos.x = (c_in / 4) * out_W + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_3, sampler, input_pos);
}
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
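// Concatenate along the height (or batch) axis: each input image is copied
// as a whole, shifted down by out_H_Start rows in the output image.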
__kernel void concatByH(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W,
__private const int out_H_Start) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos;
input_pos.x = in_c * out_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 input;
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);
int2 output_pos;
output_pos.x = input_pos.x;
output_pos.y = out_H_Start + input_pos.y;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
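// Concatenate along the width axis: each input pixel is written
// pre_Width + out_Width * in_c columns to the right of its input position.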
__kernel void concatByW(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int in_W,
__private const int pre_Width,
__private const int out_Width) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 input;
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);
int2 output_pos;
output_pos.x = input_pos.x + pre_Width + out_Width * in_c;
output_pos.y = input_pos.y;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
// deprecated
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
......@@ -104,6 +383,7 @@ __kernel void concat2(__read_only image2d_t input0,
}
}
// deprecated
__kernel void concat_mul(__read_only image2d_t input,
__write_only image2d_t output,
int flag, int C_0, int out_C, int out_W, int in_W, int width) {
......@@ -162,3 +442,6 @@ __kernel void concat_mul(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}
}
......@@ -161,8 +161,10 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
"memory_optimize_pass"}};
#ifndef LITE_WITH_PRECISION_PROFILE
"memory_optimize_pass"
#endif
}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
......
......@@ -36,6 +36,10 @@
#include "lite/backends/cuda/math/type_trans.h"
#endif
#ifdef LITE_ON_TINY_PUBLISH
#include "lite/utils/replace_stl/stream.h"
#endif
namespace paddle {
namespace lite {
namespace profile {
......@@ -88,19 +92,25 @@ class PrecisionProfiler {
PrecisionProfiler() {}
std::string GetSummaryHeader() {
#ifdef LITE_ON_TINY_PUBLISH
using replace_stl::setw;
#else
using std::setw;
#endif
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "\n\n========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
<< "========================================="
<< "\n";
ss << setw(45) << left << "operator:(kernel_info)"
<< " " << setw(70) << left << "output_tensor_name:(tensor_info)"
<< " " << setw(15) << left << "dims"
<< " " << setw(15) << left << "mean"
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
<< " " << setw(15) << left << "ave_grow_rate*"
<< "\n";
// write to file with path: `log_dir`
if (log_dir_ != "") {
......@@ -368,7 +378,11 @@ class PrecisionProfiler {
}
std::string GetInstPrecision(const Instruction* inst = nullptr) {
#ifdef LITE_ON_TINY_PUBLISH
using replace_stl::setw;
#else
using std::setw;
#endif
using std::left;
using std::fixed;
STL::stringstream ss;
......@@ -429,7 +443,7 @@ class PrecisionProfiler {
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
<< "\n";
} else if (type->IsTensorList()) {
auto touts =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
......@@ -466,7 +480,7 @@ class PrecisionProfiler {
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
<< "\n";
}
}
}
......
......@@ -38,213 +38,230 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(kernel_func_name_,
"image/concat_kernel.cl",
build_options_,
time_stamp_);
auto input_num = concat_param_->x.size();
auto* output = concat_param_->output;
auto output_dims_size = output->dims().size();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
if (out_dims.size() < 4) {
if (out_dims.size() - axis == 1) {
// width
width_ = out_dims[1]; // c
flag_ = 3;
if (output_dims_size < 4) {
if (output_dims_size - axis == 1) {
kernel_func_name_ = "concatByW";
} else {
// height
width_ = out_dims[0]; // n
flag_ = 2;
}
} else {
switch (axis_) {
case 0:
width_ = out_dims[2]; // h
flag_ = 0;
break;
case 1: // channel
width_ = out_dims[3]; // w
flag_ = 1;
break;
case 2: // height
width_ = out_dims[0]; // n
flag_ = 2;
break;
case 3:
case -1: // width
width_ = out_dims[1]; // c
flag_ = 3;
break;
default:
printf("this axis: %d does not support \n", axis_);
kernel_func_name_ = "concatByH";
}
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
// auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
} else if (output_dims_size == 4) { // output->dims.size() == 4
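// For 4-D outputs the concatByCWith*Inputs kernels concatenate along the
// channel axis; the variant is chosen by the number of input tensors.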
if (input_num == 2) {
kernel_func_name_ = "concatByCWith2Inputs";
} else if (input_num == 3) {
kernel_func_name_ = "concatByCWith3Inputs";
} else if (input_num == 4) {
kernel_func_name_ = "concatByCWith4Inputs";
} else {
LOG(FATAL) << "Unsupported input tensors number:" << input_num << ".";
}
} else { // output->dims.size() > 4
LOG(FATAL) << "Unsupported output dims " << output->dims()
<< ", whose dims.size() is bigger than 4.";
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(kernel_func_name_,
"image/concat_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.output->dims(); // useless: check dim only
auto output_tensor_dims = concat_param_->output->dims();
auto output_image_shape = InitImageDimInfoWith(output_tensor_dims);
auto output_image_p =
concat_param_->output->mutable_data<half_t, cl::Image2D>(
output_image_shape["width"], output_image_shape["height"]);
auto inputs = concat_param_->x;
auto axis = concat_param_->axis;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto inputs = param.x;
int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
if (kernel_func_name_ == "concatByW" || kernel_func_name_ == "concatByH") {
auto output_tensor_w = output_tensor_dims[output_tensor_dims.size() - 1];
if (output_tensor_dims.size() - axis == 1) {
for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) {
auto* input = inputs[input_idx];
auto input_tensor_dims = input->dims();
auto input_image_shape = InitImageDimInfoWith(input_tensor_dims);
auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1];
auto* input_image_p = input->data<half_t, cl::Image2D>();
#ifdef LITE_WITH_LOG
VLOG(4) << "concat input shape: ";
for (size_t i = 0; i < inputs.size(); i++) {
VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:"
<< " dims:" << inputs[i]->dims()[0] << " "
<< inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
<< inputs[i]->dims()[3];
}
size_t input_tensor_pre_w = 0;
for (size_t ii_idx = 0; ii_idx < input_idx; ++ii_idx) {
auto input_tensor_dims = inputs[ii_idx]->dims();
input_tensor_pre_w +=
input_tensor_dims[input_tensor_dims.size() - 1];
}
VLOG(4) << "concat output shape: ";
VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_;
#endif
int input_special_w = input_tensor_dims[output_tensor_dims.size() -
2]; // not a good var name
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
const std::vector<size_t>& default_work_size = DefaultWorkSize(
input_tensor_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(input_image_shape["width"]),
static_cast<int64_t>(input_image_shape["height"])}));
cl::NDRange global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size[0]),
static_cast<size_t>(default_work_size[1]),
static_cast<size_t>(default_work_size[2])};
cl_int status;
status = kernel.setArg(0, *input_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, *output_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, input_special_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, input_tensor_pre_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, output_tensor_w);
CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]
<< "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]);
#endif
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
} else {
size_t output_image_height_start = 0; // output image height start
for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) {
auto* input = inputs[input_idx];
auto input_tensor_dims = input->dims();
auto input_image_shape = InitImageDimInfoWith(input_tensor_dims);
auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1];
auto* input_image_p = input->data<half_t, cl::Image2D>();
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1];
int out_c = x_dims[1];
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
const std::vector<size_t>& default_work_size = DefaultWorkSize(
input_tensor_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(input_image_shape["width"]),
static_cast<int64_t>(input_image_shape["height"])}));
cl::NDRange global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size[0]),
static_cast<size_t>(default_work_size[1]),
static_cast<size_t>(default_work_size[2])};
cl_int status;
status = kernel.setArg(0, *input_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, *output_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, output_tensor_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, output_image_height_start);
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
// compute new output_image_height_start
if (output_tensor_dims.size() == 3) {
output_image_height_start += input_tensor_dims[1];
} else if (output_tensor_dims.size() == 2) {
output_image_height_start += input_tensor_dims[0];
}
}
}
} else if (kernel_func_name_ == "concatByCWith2Inputs" ||
kernel_func_name_ == "concatByCWith3Inputs" ||
kernel_func_name_ == "concatByCWith4Inputs") {
auto* input0 = inputs[0];
auto* input0_image_p = input0->data<half_t, cl::Image2D>();
size_t input0_tensor_c = input0->dims()[1];
auto* input1 = inputs.size() >= 2 ? inputs[1] : nullptr;
auto* input1_image_p =
input1 ? input1->data<half_t, cl::Image2D>() : nullptr;
size_t input1_tensor_c = input1 ? input1->dims()[1] : -1;
auto* input2 = inputs.size() >= 3 ? inputs[2] : nullptr;
auto* input2_image_p =
input2 ? input2->data<half_t, cl::Image2D>() : nullptr;
size_t input2_tensor_c = input2 ? input2->dims()[1] : -1;
auto* input3 = inputs.size() >= 4 ? inputs[3] : nullptr;
auto* input3_image_p =
input3 ? input3->data<half_t, cl::Image2D>() : nullptr;
size_t input3_tensor_c = input3 ? input3->dims()[1] : -1;
LOG(INFO) << "input0_image_p:" << input0_image_p;
LOG(INFO) << "input0_tensor_c:" << input0_tensor_c;
LOG(INFO) << "input1_image_p:" << input1_image_p;
LOG(INFO) << "input1_tensor_c:" << input1_tensor_c;
LOG(INFO) << "input2_image_p:" << input2_image_p;
LOG(INFO) << "input2_tensor_c:" << input2_tensor_c;
LOG(INFO) << "input3_image_p:" << input3_image_p;
LOG(INFO) << "input3_tensor_c:" << input3_tensor_c;
const std::vector<size_t>& default_work_size = DefaultWorkSize(
output_tensor_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(output_image_shape["width"]),
static_cast<int64_t>(output_image_shape["height"])}));
cl::NDRange global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size[0]),
static_cast<size_t>(default_work_size[1]),
static_cast<size_t>(default_work_size[2])};
cl_int status;
status = kernel.setArg(0, *output_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag_);
status = kernel.setArg(
1, static_cast<size_t>(output_tensor_dims[1])); // output_tensor_c
CL_CHECK_FATAL(status);
status =
kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
status = kernel.setArg(
2, static_cast<size_t>(output_tensor_dims[3])); // output_tensor_w
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_c);
status = kernel.setArg(3, *input0_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_w);
status = kernel.setArg(4, input0_tensor_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width_);
status = kernel.setArg(5, *input1_image_p);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
status = kernel.setArg(6, input1_tensor_c);
CL_CHECK_FATAL(status);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
auto in_dims = inputs[i]->dims();
image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1];
#ifdef LITE_WITH_LOG
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
#endif
global_work_size =
cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
if (inputs.size() >= 3) {
status = kernel.setArg(7, *input2_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_c);
status = kernel.setArg(8, input2_tensor_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width_);
}
if (inputs.size() == 4) {
status = kernel.setArg(9, *input3_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, input3_tensor_c);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
CL_CHECK_FATAL(status);
start += inputs[i]->dims()[axis_];
}
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "Unsupported kernel func name: " << kernel_func_name_;
}
}
......
......@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3opt;
#else
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
if (groups_ == 1) {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3opt;
} else { // groups_ > 1
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3;
}
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3;
#endif
#undef CONV3x3OPT_FALL_BACK
} else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
} else if (kernel_func_names_[0] == "conv2d_3x3") {
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
kernel_func_names_[0] == "conv2d_3x3_opt") {
int w_blk_size = 5;
......
......@@ -19,7 +19,7 @@
#define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))
#define IS_DIFF_PASSED(res0, res1, threshold) \
(((COMPTUE_ABS_DIFF(res0, res1) < threshold) || \
(((COMPUTE_ABS_DIFF(res0, res1) < threshold) || \
(COMPUTE_RELATIVE_DIFF(res0, res1) < threshold)) \
? true \
: false)