Commit 9f343ac2 authored by 开心的小妮

fix opencl concat. test=develop

Parent d51324bf
@@ -11,6 +11,285 @@ limitations under the License. */
#include <cl_common.h>
__kernel void concatByCWith2Inputs(
    __write_only image2d_t output_image,
    __private const int out_C,
    __private const int out_W,
    __read_only image2d_t input_image_0,
    __private const int C_0,
    __read_only image2d_t input_image_1,
    __private const int C_1) {
  const int out_c = get_global_id(0);   // channel-block index: c / 4
  const int out_w = get_global_id(1);   // column index
  const int out_nh = get_global_id(2);  // folded batch * height index

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;
  CL_DTYPE4 output_data;

  // One work-item fills the 4 channel lanes of a single output pixel,
  // fetching each lane from whichever input owns that channel.
  for (int i = 0; i < 4; i++) {
    int c = out_c * 4 + i;
    if (c >= out_C) {
      break;
    }
    int c_in;
    CL_DTYPE4 input_data;
    if (c < C_0) {
      c_in = c;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
    } else {
      c_in = c - C_0;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
    }
    int value_offset = c_in % 4;
    float value;
    if (value_offset == 0) {
      value = input_data.x;
    } else if (value_offset == 1) {
      value = input_data.y;
    } else if (value_offset == 2) {
      value = input_data.z;
    } else if (value_offset == 3) {
      value = input_data.w;
    }
    if (i == 0) {
      output_data.x = value;
    } else if (i == 1) {
      output_data.y = value;
    } else if (i == 2) {
      output_data.z = value;
    } else if (i == 3) {
      output_data.w = value;
    }
  }
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}

__kernel void concatByCWith3Inputs(
    __write_only image2d_t output_image,
    __private const int out_C,
    __private const int out_W,
    __read_only image2d_t input_image_0,
    __private const int C_0,
    __read_only image2d_t input_image_1,
    __private const int C_1,
    __read_only image2d_t input_image_2,
    __private const int C_2) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;
  CL_DTYPE4 output_data;

  for (int i = 0; i < 4; i++) {
    int c = out_c * 4 + i;
    if (c >= out_C) {
      break;
    }
    int c_in;
    CL_DTYPE4 input_data;
    if (c < C_0) {
      c_in = c;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
    } else if (c < C_0 + C_1) {
      c_in = c - C_0;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
    } else {
      c_in = c - C_0 - C_1;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
    }
    int value_offset = c_in % 4;
    float value;
    if (value_offset == 0) {
      value = input_data.x;
    } else if (value_offset == 1) {
      value = input_data.y;
    } else if (value_offset == 2) {
      value = input_data.z;
    } else if (value_offset == 3) {
      value = input_data.w;
    }
    if (i == 0) {
      output_data.x = value;
    } else if (i == 1) {
      output_data.y = value;
    } else if (i == 2) {
      output_data.z = value;
    } else if (i == 3) {
      output_data.w = value;
    }
  }
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}

__kernel void concatByCWith4Inputs(
    __write_only image2d_t output_image,
    __private const int out_C,
    __private const int out_W,
    __read_only image2d_t input_image_0,
    __private const int C_0,
    __read_only image2d_t input_image_1,
    __private const int C_1,
    __read_only image2d_t input_image_2,
    __private const int C_2,
    __read_only image2d_t input_image_3,
    __private const int C_3) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;
  CL_DTYPE4 output_data;

  for (int i = 0; i < 4; i++) {
    int c = out_c * 4 + i;
    if (c >= out_C) {
      break;
    }
    int c_in;
    CL_DTYPE4 input_data;
    if (c < C_0) {
      c_in = c;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
    } else if (c < C_0 + C_1) {
      c_in = c - C_0;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
    } else if (c < C_0 + C_1 + C_2) {
      c_in = c - C_0 - C_1;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
    } else if (c < C_0 + C_1 + C_2 + C_3) {
      c_in = c - C_0 - C_1 - C_2;
      int2 input_pos;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_pos.y = out_nh;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_3, sampler, input_pos);
    }
    int value_offset = c_in % 4;
    float value;
    if (value_offset == 0) {
      value = input_data.x;
    } else if (value_offset == 1) {
      value = input_data.y;
    } else if (value_offset == 2) {
      value = input_data.z;
    } else if (value_offset == 3) {
      value = input_data.w;
    }
    if (i == 0) {
      output_data.x = value;
    } else if (i == 1) {
      output_data.y = value;
    } else if (i == 2) {
      output_data.z = value;
    } else if (i == 3) {
      output_data.w = value;
    }
  }
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}

__kernel void concatByH(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_W,
                        __private const int out_H_Start) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  int2 input_pos;
  input_pos.x = in_c * out_W + in_w;
  input_pos.y = in_nh;
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  CL_DTYPE4 input;
  input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);

  // Copy the pixel to the same column, shifted down by the rows that
  // earlier inputs already occupy.
  int2 output_pos;
  output_pos.x = input_pos.x;
  output_pos.y = out_H_Start + input_pos.y;
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}

__kernel void concatByW(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int in_W,
                        __private const int pre_Width,
                        __private const int out_Width) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  int2 input_pos;
  input_pos.x = in_c * in_W + in_w;
  input_pos.y = in_nh;
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  CL_DTYPE4 input;
  input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);

  int2 output_pos;
  output_pos.x = input_pos.x + pre_Width + out_Width * in_c;
  output_pos.y = input_pos.y;
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
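All of the new kernels assume PaddleLite's image2d layout for NCHW tensors: channels are packed four to a pixel, pixel x indexes (channel block, width) and pixel y indexes (batch, height). A minimal host-side sketch of that coordinate mapping (helper names are illustrative, not part of the patch):

// Illustrative helper, not part of the patch: maps an NCHW element to the
// (x, y, lane) it occupies in the image2d layout the kernels above index.
struct ImageCoord {
  int x;     // (c / 4) * W + w : channel block, then column
  int y;     // n * H + h       : batch rows stacked vertically
  int lane;  // c % 4           : .x/.y/.z/.w of the CL_DTYPE4 pixel
};

ImageCoord NCHWToImageCoord(int n, int c, int h, int w, int H, int W) {
  return ImageCoord{(c / 4) * W + w, n * H + h, c % 4};
}

// concatByCWith2Inputs effectively computes, for each output channel c:
//   c <  C_0 : read NCHWToImageCoord(n, c,       h, w, H, W) from input 0
//   c >= C_0 : read NCHWToImageCoord(n, c - C_0, h, w, H, W) from input 1
// which is why a single output pixel may gather lanes from both inputs
// whenever C_0 is not a multiple of 4.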
// deprecated
__kernel void concat2(__read_only image2d_t input0,
                      __read_only image2d_t input1,
                      __write_only image2d_t output,
@@ -104,6 +383,7 @@ __kernel void concat2(__read_only image2d_t input0,
  }
}

// deprecated
__kernel void concat_mul(__read_only image2d_t input,
                         __write_only image2d_t output,
                         int flag, int C_0, int out_C, int out_W, int in_W, int width) {
@@ -162,3 +442,6 @@ __kernel void concat_mul(__read_only image2d_t input,
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
  }
}
@@ -161,8 +161,10 @@ class Optimizer {
        "runtime_context_assign_pass",
        "argument_type_display_pass",
#ifndef LITE_WITH_PRECISION_PROFILE
        "memory_optimize_pass"
#endif
    }};
    if (passes.size() == 1) {
      // multi_stream_analysis_pass must be in the front of
......
@@ -36,6 +36,10 @@
#include "lite/backends/cuda/math/type_trans.h"
#endif
#ifdef LITE_ON_TINY_PUBLISH
#include "lite/utils/replace_stl/stream.h"
#endif
namespace paddle {
namespace lite {
namespace profile {
@@ -88,19 +92,25 @@ class PrecisionProfiler {
  PrecisionProfiler() {}

  std::string GetSummaryHeader() {
#ifdef LITE_ON_TINY_PUBLISH
    using replace_stl::setw;
#else
    using std::setw;
#endif
    using std::left;
    using std::fixed;
    STL::stringstream ss;
    ss << "\n\n========================================= "
       << "Detailed Precision Profiler Summary "
       << "========================================="
       << "\n";
    ss << setw(45) << left << "operator:(kernel_info)"
       << " " << setw(70) << left << "output_tensor_name:(tensor_info)"
       << " " << setw(15) << left << "dims"
       << " " << setw(15) << left << "mean"
       << " " << setw(15) << left << "std_deviation"
       << " " << setw(15) << left << "ave_grow_rate*"
       << "\n";

    // write to file with path: `log_dir`
    if (log_dir_ != "") {
@@ -368,7 +378,11 @@ class PrecisionProfiler {
  }

  std::string GetInstPrecision(const Instruction* inst = nullptr) {
#ifdef LITE_ON_TINY_PUBLISH
    using replace_stl::setw;
#else
    using std::setw;
#endif
    using std::left;
    using std::fixed;
    STL::stringstream ss;
@@ -429,7 +443,7 @@ class PrecisionProfiler {
           << output_arg_info << " " << setw(15) << left << tout->dims()
           << " " << setw(15) << left << mean_str << " " << setw(15) << left
           << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
           << "\n";
      } else if (type->IsTensorList()) {
        auto touts =
            op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
@@ -466,7 +480,7 @@ class PrecisionProfiler {
           << output_arg_info << " " << setw(15) << left << tout->dims()
           << " " << setw(15) << left << mean_str << " " << setw(15) << left
           << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
           << "\n";
      }
    }
  }
......
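The LITE_ON_TINY_PUBLISH guard exists because tiny-publish builds drop <iomanip> to shrink the binary, so std::setw is unavailable and a lightweight substitute from lite/utils/replace_stl/stream.h is used instead. A hypothetical sketch of what such a setw shim can look like (the actual replace_stl implementation may differ):

#include <string>

// Hypothetical sketch of a minimal std::setw replacement for builds that
// drop <iomanip>; the real lite/utils/replace_stl/stream.h may differ.
namespace replace_stl {

struct SetwToken {
  int width;
};

// Same call shape as std::setw: returns a token the stream consumes.
inline SetwToken setw(int w) { return SetwToken{w}; }

class MiniStream {
 public:
  MiniStream& operator<<(SetwToken t) {
    width_ = t.width;
    return *this;
  }
  MiniStream& operator<<(const std::string& s) {
    buf_ += s;
    if (static_cast<int>(s.size()) < width_) {
      buf_.append(width_ - s.size(), ' ');  // left-justified field padding
    }
    width_ = 0;
    return *this;
  }
  const std::string& str() const { return buf_; }

 private:
  std::string buf_;
  int width_ = 0;
};

}  // namespace replace_stl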
@@ -38,213 +38,230 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    auto input_num = concat_param_->x.size();
    auto* output = concat_param_->output;
    auto output_dims_size = output->dims().size();
    auto axis = concat_param_->axis;
    if (output_dims_size < 4) {
      if (output_dims_size - axis == 1) {
        kernel_func_name_ = "concatByW";
      } else {
        kernel_func_name_ = "concatByH";
      }
    } else if (output_dims_size == 4) {  // output->dims.size() == 4
      if (input_num == 2) {
        kernel_func_name_ = "concatByCWith2Inputs";
      } else if (input_num == 3) {
        kernel_func_name_ = "concatByCWith3Inputs";
      } else if (input_num == 4) {
        kernel_func_name_ = "concatByCWith4Inputs";
      } else {
        LOG(FATAL) << "Unsupported input tensors number:" << input_num << ".";
      }
    } else {  // output->dims.size() > 4
      LOG(FATAL) << "Unsupported output dims " << output->dims()
                 << ", whose dims.size() is bigger than 4.";
    }
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
    context.cl_context()->AddKernel(kernel_func_name_,
                                    "image/concat_kernel.cl",
                                    build_options_,
                                    time_stamp_);
  }

  void Run() override {
    auto output_tensor_dims = concat_param_->output->dims();
    auto output_image_shape = InitImageDimInfoWith(output_tensor_dims);
    auto output_image_p =
        concat_param_->output->mutable_data<half_t, cl::Image2D>(
            output_image_shape["width"], output_image_shape["height"]);
    auto inputs = concat_param_->x;
    auto axis = concat_param_->axis;

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    if (kernel_func_name_ == "concatByW" || kernel_func_name_ == "concatByH") {
      auto output_tensor_w = output_tensor_dims[output_tensor_dims.size() - 1];
      if (output_tensor_dims.size() - axis == 1) {
        for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) {
          auto* input = inputs[input_idx];
          auto input_tensor_dims = input->dims();
          auto input_image_shape = InitImageDimInfoWith(input_tensor_dims);
          auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1];
          auto* input_image_p = input->data<half_t, cl::Image2D>();

          size_t input_tensor_pre_w = 0;
          for (size_t ii_idx = 0; ii_idx < input_idx; ++ii_idx) {
            auto input_tensor_dims = inputs[ii_idx]->dims();
            input_tensor_pre_w +=
                input_tensor_dims[input_tensor_dims.size() - 1];
          }

          int input_special_w = input_tensor_dims[output_tensor_dims.size() -
                                                  2];  // not a good var name

          const std::vector<size_t>& default_work_size = DefaultWorkSize(
              input_tensor_dims,
              DDim(std::vector<DDim::value_type>{
                  static_cast<int64_t>(input_image_shape["width"]),
                  static_cast<int64_t>(input_image_shape["height"])}));
          cl::NDRange global_work_size =
              cl::NDRange{static_cast<size_t>(default_work_size[0]),
                          static_cast<size_t>(default_work_size[1]),
                          static_cast<size_t>(default_work_size[2])};
          cl_int status;
          status = kernel.setArg(0, *input_image_p);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(1, *output_image_p);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(2, input_special_w);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(3, input_tensor_pre_w);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(4, output_tensor_w);
          CL_CHECK_FATAL(status);
          status = EnqueueNDRangeKernel(context,
                                        kernel,
                                        cl::NullRange,
                                        global_work_size,
                                        cl::NullRange,
                                        nullptr,
                                        event_);
          CL_CHECK_FATAL(status);
        }
      } else {
        size_t output_image_height_start = 0;  // output image height start
        for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) {
          auto* input = inputs[input_idx];
          auto input_tensor_dims = input->dims();
          auto input_image_shape = InitImageDimInfoWith(input_tensor_dims);
          auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1];
          auto* input_image_p = input->data<half_t, cl::Image2D>();

          const std::vector<size_t>& default_work_size = DefaultWorkSize(
              input_tensor_dims,
              DDim(std::vector<DDim::value_type>{
                  static_cast<int64_t>(input_image_shape["width"]),
                  static_cast<int64_t>(input_image_shape["height"])}));
          cl::NDRange global_work_size =
              cl::NDRange{static_cast<size_t>(default_work_size[0]),
                          static_cast<size_t>(default_work_size[1]),
                          static_cast<size_t>(default_work_size[2])};

          cl_int status;
          status = kernel.setArg(0, *input_image_p);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(1, *output_image_p);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(2, output_tensor_w);
          CL_CHECK_FATAL(status);
          status = kernel.setArg(3, output_image_height_start);
          CL_CHECK_FATAL(status);
          status = EnqueueNDRangeKernel(context,
                                        kernel,
                                        cl::NullRange,
                                        global_work_size,
                                        cl::NullRange,
                                        nullptr,
                                        event_);
          CL_CHECK_FATAL(status);

          // compute new output_image_height_start
          if (output_tensor_dims.size() == 3) {
            output_image_height_start += input_tensor_dims[1];
          } else if (output_tensor_dims.size() == 2) {
            output_image_height_start += input_tensor_dims[0];
          }
        }
      }
    } else if (kernel_func_name_ == "concatByCWith2Inputs" ||
               kernel_func_name_ == "concatByCWith3Inputs" ||
               kernel_func_name_ == "concatByCWith4Inputs") {
      auto* input0 = inputs[0];
      auto* input0_image_p = input0->data<half_t, cl::Image2D>();
      size_t input0_tensor_c = input0->dims()[1];

      auto* input1 = inputs.size() >= 2 ? inputs[1] : nullptr;
      auto* input1_image_p =
          input1 ? input1->data<half_t, cl::Image2D>() : nullptr;
      size_t input1_tensor_c = input1 ? input1->dims()[1] : -1;

      auto* input2 = inputs.size() >= 3 ? inputs[2] : nullptr;
      auto* input2_image_p =
          input2 ? input2->data<half_t, cl::Image2D>() : nullptr;
      size_t input2_tensor_c = input2 ? input2->dims()[1] : -1;

      auto* input3 = inputs.size() >= 4 ? inputs[3] : nullptr;
      auto* input3_image_p =
          input3 ? input3->data<half_t, cl::Image2D>() : nullptr;
      size_t input3_tensor_c = input3 ? input3->dims()[1] : -1;

      LOG(INFO) << "input0_image_p:" << input0_image_p;
      LOG(INFO) << "input0_tensor_c:" << input0_tensor_c;
      LOG(INFO) << "input1_image_p:" << input1_image_p;
      LOG(INFO) << "input1_tensor_c:" << input1_tensor_c;
      LOG(INFO) << "input2_image_p:" << input2_image_p;
      LOG(INFO) << "input2_tensor_c:" << input2_tensor_c;
      LOG(INFO) << "input3_image_p:" << input3_image_p;
      LOG(INFO) << "input3_tensor_c:" << input3_tensor_c;

      const std::vector<size_t>& default_work_size = DefaultWorkSize(
          output_tensor_dims,
          DDim(std::vector<DDim::value_type>{
              static_cast<int64_t>(output_image_shape["width"]),
              static_cast<int64_t>(output_image_shape["height"])}));
      cl::NDRange global_work_size =
          cl::NDRange{static_cast<size_t>(default_work_size[0]),
                      static_cast<size_t>(default_work_size[1]),
                      static_cast<size_t>(default_work_size[2])};
      cl_int status;
      status = kernel.setArg(0, *output_image_p);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(
          1, static_cast<size_t>(output_tensor_dims[1]));  // output_tensor_c
      CL_CHECK_FATAL(status);
      status = kernel.setArg(
          2, static_cast<size_t>(output_tensor_dims[3]));  // output_tensor_w
      CL_CHECK_FATAL(status);
      status = kernel.setArg(3, *input0_image_p);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(4, input0_tensor_c);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(5, *input1_image_p);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(6, input1_tensor_c);
      CL_CHECK_FATAL(status);
      if (inputs.size() >= 3) {
        status = kernel.setArg(7, *input2_image_p);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(8, input2_tensor_c);
        CL_CHECK_FATAL(status);
      }
      if (inputs.size() == 4) {
        status = kernel.setArg(9, *input3_image_p);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(10, input3_tensor_c);
        CL_CHECK_FATAL(status);
      }
      status = EnqueueNDRangeKernel(context,
                                    kernel,
                                    cl::NullRange,
                                    global_work_size,
                                    cl::NullRange,
                                    nullptr,
                                    event_);
      CL_CHECK_FATAL(status);
    } else {
      LOG(FATAL) << "Unsupported kernel func name: " << kernel_func_name_;
    }
  }
......
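To summarize the dispatch above: the kernel is chosen from the output rank, the concat axis, and the number of inputs. A standalone sketch of the same rule (a hypothetical helper mirroring PrepareForRun, not part of the patch):

#include <cstddef>
#include <string>

// Hypothetical helper mirroring the dispatch in PrepareForRun() above.
std::string PickConcatKernel(size_t output_rank, int axis, size_t input_num) {
  if (output_rank < 4) {
    // Rank-2/3 outputs: the last axis concats along image width,
    // any other axis concats along image height.
    return (output_rank - axis == 1) ? "concatByW" : "concatByH";
  }
  if (output_rank == 4 && input_num >= 2 && input_num <= 4) {
    // Rank-4 outputs: channel concat, specialized per input count.
    return "concatByCWith" + std::to_string(input_num) + "Inputs";
  }
  return "";  // the real code LOG(FATAL)s on these combinations
}

// e.g. PickConcatKernel(4, 1, 2) -> "concatByCWith2Inputs"
//      PickConcatKernel(3, 2, 5) -> "concatByW"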
@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
    impl_ = &ConvImageCompute::DepthwiseConv2d;
  } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
    // conv2d_3x3
    if (groups_ == 1) {
      kernel_func_names_.push_back(
          input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
      kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
      impl_ = &ConvImageCompute::Conv2d3x3opt;
    } else {  // groups_ > 1
      kernel_func_names_.push_back("conv2d_3x3");
      kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
      impl_ = &ConvImageCompute::Conv2d3x3;
    }

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    filter_image_h_ = filter_image_dims[1];
    filter_image_w_ = filter_image_dims[0];
    tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
    half_t* filter_image_data =
        tensor_hold_filter_image_->mutable_data<half_t>();
    converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
    filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
        filter_image_w_, filter_image_h_, filter_image_data);
  } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
                                    static_cast<size_t>(w_blk_),
                                    static_cast<size_t>(nh_blk_)};
    input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
  } else if (kernel_func_names_[0] == "conv2d_3x3") {
    global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
                                    static_cast<size_t>(w_blk_),
                                    static_cast<size_t>(nh_blk_)};
  } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
             kernel_func_names_[0] == "conv2d_3x3_opt") {
    int w_blk_size = 5;
......
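The added branch gives the generic conv2d_3x3 kernel a plain per-pixel launch grid: one work-item per channel block, output column, and folded batch-row. A sketch of that arithmetic with hypothetical names, assuming c_blk_/w_blk_/nh_blk_ follow the usual image2d folding of an N x C x H x W output:

#include <cstddef>

// Hypothetical helper: the {c_blk, w_blk, nh_blk} grid shape assumed above
// (channels packed 4 per pixel, batch and height fused into one axis).
struct WorkSize3D {
  size_t c_blk, w_blk, nh_blk;
};

WorkSize3D DefaultConvGrid(size_t n, size_t c, size_t h, size_t w) {
  return WorkSize3D{(c + 3) / 4,  // one work-item per 4-channel block
                    w,            // one per output column
                    n * h};       // batch and rows folded into one axis
}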
@@ -19,7 +19,7 @@
#define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))
#define IS_DIFF_PASSED(res0, res1, threshold) \
  (((COMPUTE_ABS_DIFF(res0, res1) < threshold) || \
    (COMPUTE_RELATIVE_DIFF(res0, res1) < threshold)) \
       ? true \
       : false)
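This hunk fixes a misspelled COMPTUE_ABS_DIFF reference, which fails to compile wherever the macro is expanded. A small self-contained check of the corrected macro (assuming COMPUTE_ABS_DIFF expands to abs(res0 - res1), mirroring COMPUTE_RELATIVE_DIFF's style; its actual definition sits just outside this hunk):

#include <cmath>
#include <cstdio>
using std::abs;

// Assumption: the out-of-hunk definition looks like this.
#define COMPUTE_ABS_DIFF(res0, res1) abs(res0 - res1)
#define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))
#define IS_DIFF_PASSED(res0, res1, threshold) \
  (((COMPUTE_ABS_DIFF(res0, res1) < threshold) || \
    (COMPUTE_RELATIVE_DIFF(res0, res1) < threshold)) \
       ? true \
       : false)

int main() {
  // Large magnitudes pass via the relative branch: |1000.0 - 1000.4| = 0.4
  // fails a 0.1 absolute threshold, but 0.4 / 1000.4 ~= 4e-4 passes.
  printf("%d\n", IS_DIFF_PASSED(1000.0, 1000.4, 0.1));  // prints 1
  // Small magnitudes must pass the absolute branch; 0.5 fails both.
  printf("%d\n", IS_DIFF_PASSED(0.0, 0.5, 0.1));        // prints 0
  return 0;
}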