提交 7bba07bf 编写于 作者: Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Support video-sr feature using OpenCL FP16 Image (#3049)

* [LITE][OPENCL] Support video-sr feature using OpenCL FP16 Image. test=develop

* optimize image2d_to_buffer_with_post255. test=develop

* add `#ifdef DEBUG` debug printing in cl kernel. test=develop

* remove conv image code in conv buffer. test=develop
上级 05b5ef29
...@@ -38,11 +38,13 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -38,11 +38,13 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
std::vector<std::string> passes{}; std::vector<std::string> passes{};
auto use_layout_preprocess_pass = auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS"); config.model_dir().find("OPENCL_PRE_PRECESS");
if (use_layout_preprocess_pass != std::string::npos) { VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
if (places[0].target == TARGET(kOpenCL) &&
use_layout_preprocess_pass != std::string::npos) {
passes = {"type_layout_cast_preprocess_pass"}; passes = {"type_layout_cast_preprocess_pass"};
VLOG(1) << "add pass:" << passes[0];
} }
raw_predictor_.Build(config, places, passes); raw_predictor_.Build(config, places, passes);
mode_ = config.power_mode(); mode_ = config.power_mode();
threads_ = config.threads(); threads_ = config.threads();
......
...@@ -38,6 +38,7 @@ void Tensor::Resize(const shape_t &shape) { ...@@ -38,6 +38,7 @@ void Tensor::Resize(const shape_t &shape) {
tensor(raw_tensor_)->Resize(shape); tensor(raw_tensor_)->Resize(shape);
} }
// Tensor::data
template <> template <>
const float *Tensor::data() const { const float *Tensor::data() const {
return ctensor(raw_tensor_)->data<float>(); return ctensor(raw_tensor_)->data<float>();
...@@ -47,15 +48,19 @@ const int8_t *Tensor::data() const { ...@@ -47,15 +48,19 @@ const int8_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int8_t>(); return ctensor(raw_tensor_)->data<int8_t>();
} }
template <> template <>
const uint8_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<uint8_t>();
}
template <>
const int64_t *Tensor::data() const { const int64_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int64_t>(); return ctensor(raw_tensor_)->data<int64_t>();
} }
template <> template <>
const int32_t *Tensor::data() const { const int32_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int32_t>(); return ctensor(raw_tensor_)->data<int32_t>();
} }
// Tensor::mutable_data
template <> template <>
int *Tensor::mutable_data(TargetType type) const { int *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int>(type); return tensor(raw_tensor_)->mutable_data<int>(type);
...@@ -69,6 +74,10 @@ int8_t *Tensor::mutable_data(TargetType type) const { ...@@ -69,6 +74,10 @@ int8_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int8_t>(type); return tensor(raw_tensor_)->mutable_data<int8_t>(type);
} }
template <> template <>
uint8_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<uint8_t>(type);
}
template <>
int64_t *Tensor::mutable_data(TargetType type) const { int64_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int64_t>(type); return tensor(raw_tensor_)->mutable_data<int64_t>(type);
} }
...@@ -116,18 +125,22 @@ void Tensor::CopyToCpu(T *data) const { ...@@ -116,18 +125,22 @@ void Tensor::CopyToCpu(T *data) const {
template void Tensor::CopyFromCpu<int, TargetType::kHost>(const int *); template void Tensor::CopyFromCpu<int, TargetType::kHost>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kHost>(const float *); template void Tensor::CopyFromCpu<float, TargetType::kHost>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kHost>(const int8_t *); template void Tensor::CopyFromCpu<int8_t, TargetType::kHost>(const int8_t *);
template void Tensor::CopyFromCpu<uint8_t, TargetType::kHost>(const uint8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *); template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *); template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *); template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
template void Tensor::CopyFromCpu<uint8_t, TargetType::kARM>(const uint8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *); template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *); template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *); template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *); template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyToCpu(int8_t *) const;
template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const;
template void Tensor::CopyToCpu(uint8_t *) const;
shape_t Tensor::shape() const { shape_t Tensor::shape() const {
return ctensor(raw_tensor_)->dims().Vectorize(); return ctensor(raw_tensor_)->dims().Vectorize();
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// #define DEBUG // #define DEBUG
////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
...@@ -80,8 +82,9 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -80,8 +82,9 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
////////////////////////////////////////////////////////
// image2d -> buffer // image2d -> buffer
////////////////////////////////////////////////////////
__kernel void image2d_to_buffer(__read_only image2d_t input, __kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width, __private const int in_width,
__private const int in_height, __private const int in_height,
...@@ -125,8 +128,10 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -125,8 +128,10 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
} }
#if 0 #if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
...@@ -178,7 +183,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -178,7 +183,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
#endif #endif
#if 0 #if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -200,7 +205,9 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -200,7 +205,9 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
} }
#endif #endif
////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
...@@ -248,7 +255,10 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -248,7 +255,10 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
////////////////////////////////////////////////////////
__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
__private const int in_width, __private const int in_width,
__private const int in_height, __private const int in_height,
...@@ -267,17 +277,22 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -267,17 +277,22 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255;
#ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w);
#endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x * 255); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if(C - 4 * in_c>=2){
out[index + size_ch] = convert_uchar_sat(in.y * 255); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if(C - 4 * in_c>=3){
out[index + size_ch * 2] = convert_uchar_sat(in.z * 255); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if(C - 4 * in_c>=4){
out[index + size_ch * 3] = convert_uchar_sat(in.w * 255); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -217,7 +217,9 @@ void OpenCLTypeLayoutTransformPass::Apply( ...@@ -217,7 +217,9 @@ void OpenCLTypeLayoutTransformPass::Apply(
for (auto& node : nodes) { for (auto& node : nodes) {
VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
if (node->AsStmt().op_type() == "layout") { VLOG(1) << "node->AsStmt().op_type():" << node->AsStmt().op_type();
if (node->AsStmt().op_type() == "layout" ||
node->AsStmt().op_type() == "io_copy") {
auto new_op = node->AsStmt().mutable_op_info(); auto new_op = node->AsStmt().mutable_op_info();
int process_type = 1; int process_type = 1;
new_op->SetAttr("process_type", process_type); new_op->SetAttr("process_type", process_type);
......
...@@ -42,16 +42,11 @@ class IoCopyHostToOpenCLCompute ...@@ -42,16 +42,11 @@ class IoCopyHostToOpenCLCompute
CHECK(param.x->target() == TARGET(kHost) || CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kARM)); param.x->target() == TARGET(kARM));
auto mem_size = param.x->memory_size(); auto mem_size = param.x->memory_size();
VLOG(2) << "param.x->memory_size():" << mem_size;
VLOG(4) << "copy size " << mem_size; VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
VLOG(4) << "param.x->dims().size():" << param.x->dims().size(); VLOG(2) << "param.x->dims():" << param.x->dims();
VLOG(4) << "param.x->dims():" << param.x->dims()[0] << " " VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
<< param.x->dims()[1] << " " << param.x->dims()[2] << " " VLOG(2) << "param.y->dims():" << param.y->dims();
<< param.x->dims()[3];
VLOG(4) << "param.y->dims().size():" << param.y->dims().size();
VLOG(4) << "param.y->dims():" << param.y->dims()[0] << " "
<< param.y->dims()[1] << " " << param.y->dims()[2] << " "
<< param.y->dims()[3];
auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size); auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size);
CopyFromHostSync(data, param.x->raw_data(), mem_size); CopyFromHostSync(data, param.x->raw_data(), mem_size);
} }
...@@ -89,23 +84,27 @@ class IoCopykOpenCLToHostCompute ...@@ -89,23 +84,27 @@ class IoCopykOpenCLToHostCompute
auto& param = Param<operators::IoCopyParam>(); auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kOpenCL)); CHECK(param.x->target() == TARGET(kOpenCL));
auto mem_size = param.x->memory_size(); auto mem_size = param.x->memory_size();
VLOG(4) << "copy size " << mem_size;
VLOG(4) << "param.x->dims().size():" << param.x->dims().size(); VLOG(2) << "copy size " << mem_size;
VLOG(4) << "param.x->dims():" << param.x->dims()[0] << " " VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
<< param.x->dims()[1] << " " << param.x->dims()[2] << " " VLOG(2) << "param.x->dims():" << param.x->dims();
<< param.x->dims()[3]; VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
VLOG(4) << "param.y->dims().size():" << param.y->dims().size(); VLOG(2) << "param.y->dims():" << param.y->dims();
VLOG(4) << "param.y->dims():" << param.y->dims()[0] << " " VLOG(2) << "param.process_type:" << param.process_type;
<< param.y->dims()[1] << " " << param.y->dims()[2] << " "
<< param.y->dims()[3];
auto* data = param.y->mutable_data(TARGET(kHost), mem_size); auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
const cl::Buffer* x_ptr;
if (param.process_type == 1) {
x_ptr = param.x->data<uint8_t, cl::Buffer>();
} else {
x_ptr = param.x->data<float, cl::Buffer>();
}
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
auto* wait_list = context.cl_wait_list(); auto* wait_list = context.cl_wait_list();
auto* x_ptr = param.x->data<float, cl::Buffer>();
auto it = wait_list->find(x_ptr); auto it = wait_list->find(x_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second); auto& event = *(it->second);
event.wait(); event.wait();
} else { } else {
......
...@@ -42,6 +42,7 @@ class LayoutComputeBufferChwToImageDefault ...@@ -42,6 +42,7 @@ class LayoutComputeBufferChwToImageDefault
if (param.process_type == 1) { if (param.process_type == 1) {
kernel_func_name_ = "buffer_to_image2d_with_pre255"; kernel_func_name_ = "buffer_to_image2d_with_pre255";
} }
VLOG(2) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/layout_kernel.cl", build_options_); kernel_func_name_, "image/layout_kernel.cl", build_options_);
...@@ -73,20 +74,21 @@ class LayoutComputeBufferChwToImageDefault ...@@ -73,20 +74,21 @@ class LayoutComputeBufferChwToImageDefault
const int Stride1 = out_H * out_W; const int Stride1 = out_H * out_W;
const int Stride0 = out_W; const int Stride0 = out_W;
VLOG(4) << "y image_shape(w,h):" << image_shape["width"] << " " VLOG(2) << "param.process_type:" << param.process_type;
<< image_shape["height"]; VLOG(2) << "x_dims:" << x_dims;
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(2) << "param.x->memory_size():" << param.x->memory_size();
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(2) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
<< new_dims[1] << " " << new_dims[2] << " " << new_dims[3]; << new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
VLOG(4) << "out_C:" << out_C; VLOG(2) << "y_dims:" << y_dims;
VLOG(4) << "out_H:" << out_H; VLOG(2) << "param.y->memory_size():" << param.y->memory_size();
VLOG(4) << "out_W:" << out_W; VLOG(2) << "y image_shape(w,h):" << image_shape["width"] << " "
VLOG(4) << "Stride2:" << Stride2; << image_shape["height"];
VLOG(4) << "Stride1:" << Stride1; VLOG(2) << "out_C:" << out_C;
VLOG(4) << "Stride0:" << Stride0; VLOG(2) << "out_H:" << out_H;
VLOG(2) << "out_W:" << out_W;
VLOG(2) << "Stride2:" << Stride2;
VLOG(2) << "Stride1:" << Stride1;
VLOG(2) << "Stride0:" << Stride0;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
...@@ -112,7 +114,7 @@ class LayoutComputeBufferChwToImageDefault ...@@ -112,7 +114,7 @@ class LayoutComputeBufferChwToImageDefault
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2)); status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3] VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]); << " " << (new_dims[0] * new_dims[2]);
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4), cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
...@@ -151,6 +153,7 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -151,6 +153,7 @@ class LayoutComputeImageDefaultToBufferChw
if (param.process_type == 1) { if (param.process_type == 1) {
kernel_func_name_ = "image2d_to_buffer_with_post255"; kernel_func_name_ = "image2d_to_buffer_with_post255";
} }
VLOG(2) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/layout_kernel.cl", build_options_); kernel_func_name_, "image/layout_kernel.cl", build_options_);
...@@ -174,14 +177,15 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -174,14 +177,15 @@ class LayoutComputeImageDefaultToBufferChw
new_dims[4 - x_dims.size() + j] = x_dims[j]; new_dims[4 - x_dims.size() + j] = x_dims[j];
} }
VLOG(4) << "x_image_shape(w,h):" << x_image_shape["width"] << " " VLOG(2) << "param.process_type:" << param.process_type;
VLOG(2) << "x_dims:" << x_dims;
VLOG(2) << "param.x->memory_size():" << param.x->memory_size();
VLOG(2) << "x_image_shape(w,h):" << x_image_shape["width"] << " "
<< x_image_shape["height"]; << x_image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(2) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
<< new_dims[1] << " " << new_dims[2] << " " << new_dims[3]; << new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
VLOG(2) << "y_dims:" << y_dims;
VLOG(2) << "param.y->memory_size():" << param.y->memory_size();
size_t C = new_dims[1]; size_t C = new_dims[1];
size_t in_height = new_dims[2]; size_t in_height = new_dims[2];
...@@ -213,7 +217,7 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -213,7 +217,7 @@ class LayoutComputeImageDefaultToBufferChw
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(C)); status = kernel.setArg(++arg_idx, static_cast<const int>(C));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3] VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]); << " " << (new_dims[0] * new_dims[2]);
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4), cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
...@@ -307,7 +311,7 @@ class LayoutComputeBufferChwToImage2DNw ...@@ -307,7 +311,7 @@ class LayoutComputeBufferChwToImage2DNw
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2)); status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " " VLOG(2) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
<< (out_C * out_H); << (out_C * out_H);
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4), // N blocks cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4), // N blocks
......
...@@ -35,6 +35,9 @@ bool IoCopyOp::AttachImpl(const cpp::OpDesc &opdesc, ...@@ -35,6 +35,9 @@ bool IoCopyOp::AttachImpl(const cpp::OpDesc &opdesc,
auto out = opdesc.Output("Out").front(); auto out = opdesc.Output("Out").front();
param_.x = GetTensor(scope, x); param_.x = GetTensor(scope, x);
param_.y = GetMutableTensor(scope, out); param_.y = GetMutableTensor(scope, out);
if (opdesc.HasAttr("process_type")) {
param_.process_type = opdesc.GetAttr<int>("process_type");
}
return true; return true;
} }
std::string IoCopyOp::DebugString() const { return "io_copy_op"; } std::string IoCopyOp::DebugString() const { return "io_copy_op"; }
......
...@@ -57,6 +57,7 @@ struct FetchParam { ...@@ -57,6 +57,7 @@ struct FetchParam {
struct IoCopyParam { struct IoCopyParam {
const lite::Tensor* x{}; const lite::Tensor* x{};
lite::Tensor* y{}; lite::Tensor* y{};
int process_type{0};
}; };
struct LayoutParam { struct LayoutParam {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册