diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
index 5dc264f7e5b0e276c37566393acc355d83c4fed7..6fe5596a4cf5cbce5b50c9a3d53be164aad8a0b5 100644
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -1,28 +1,28 @@
 #include <cl_common.h>
-__kernel void conv2d_1x1(
-    __private const int global_size_dim0,
-    __private const int global_size_dim1,
-    __private const int global_size_dim2,
-    __read_only image2d_t input_image,
-    __read_only image2d_t filter,
+__kernel void conv2d_1x1(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-    __read_only image2d_t bias,
+                         __read_only image2d_t bias,
 #endif
-    #ifdef BATCH_NORM
-    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
-    #endif
-    __write_only image2d_t output_image,
-    __private const int stride,
-    __private const int offset,
-    __private const int input_c,
-    __private const int input_c_origin,
-    __private const int dilation,
-    __private const int input_width,  /* of one block */
-    __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w) {
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int input_c_origin,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height,
+                         __private const int old_w) {
   CL_DTYPE zero = 0.0f;
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
@@ -61,12 +61,13 @@ __kernel void conv2d_1x1(
       ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);

 #ifdef BIASE_CH
-  CL_DTYPE4 output0 = read_imagef(bias, sampler, (int2)(out_c, 0));
+  CL_DTYPE4 output0 =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
   CL_DTYPE4 output1 = output0;
   CL_DTYPE4 output2 = output0;
   CL_DTYPE4 output3 = output0;
 #elif defined(BIASE_ELE)
-  CL_DTYPE4 output0 = read_imagef(bias, sampler, output_pos0);
+  CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
   CL_DTYPE4 output1 = output0;
   CL_DTYPE4 output2 = output0;
   CL_DTYPE4 output3 = output0;
@@ -89,12 +90,17 @@ __kernel void conv2d_1x1(
     // ------------0---------------
     int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                          in_pos_in_one_block0.y);
-    CL_DTYPE4 input0 = read_imagef(input_image, sampler, pos_in);
-
-    CL_DTYPE4 weight0 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 0));
-    CL_DTYPE4 weight1 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 1));
-    CL_DTYPE4 weight2 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 2));
-    CL_DTYPE4 weight3 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 3));
+    CL_DTYPE4 input0 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
+
+    CL_DTYPE4 weight0 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
+    CL_DTYPE4 weight1 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
+    CL_DTYPE4 weight2 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
+    CL_DTYPE4 weight3 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));

     int bound_gap = max_w_bound - pos_in.x - 1;
     bool outof_bound = bound_gap < input_width && bound_gap >= 0;
@@ -109,7 +115,8 @@ __kernel void conv2d_1x1(
     // -------------1--------------
     pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                     in_pos_in_one_block1.y);
-    CL_DTYPE4 input1 = read_imagef(input_image, sampler, pos_in);
+    CL_DTYPE4 input1 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

     bound_gap = max_w_bound - pos_in.x - 1;

@@ -126,7 +133,8 @@ __kernel void conv2d_1x1(
     // -------------2--------------
     pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
                     in_pos_in_one_block2.y);
-    CL_DTYPE4 input2 = read_imagef(input_image, sampler, pos_in);
+    CL_DTYPE4 input2 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

     bound_gap = max_w_bound - pos_in.x - 1;

@@ -143,7 +151,8 @@ __kernel void conv2d_1x1(
     // -------------3--------------
     pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
                     in_pos_in_one_block3.y);
-    CL_DTYPE4 input3 = read_imagef(input_image, sampler, pos_in);
+    CL_DTYPE4 input3 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

     bound_gap = max_w_bound - pos_in.x - 1;
     outof_bound = bound_gap < input_width && bound_gap >= 0;
@@ -165,17 +174,21 @@ __kernel void conv2d_1x1(
   }

 #ifdef BATCH_NORM
-  output0 = output0 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));

-  output1 = output1 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));

-  output2 = output2 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));

-  output3 = output3 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -186,18 +199,18 @@
 #endif

   if (out_w0 < old_w) {
-    write_imagef(output_image, output_pos0, output0);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
   }

   if (out_w1 < old_w) {
-    write_imagef(output_image, output_pos1, output1);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
   }

   if (out_w2 < old_w) {
-    write_imagef(output_image, output_pos2, output2);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
   }

   if (out_w3 < old_w) {
-    write_imagef(output_image, output_pos3, output3);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
   }
 }
diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..314be875d29d2125f9573d33010ee9d33317ea71
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl
@@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void reshape(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int out_C,
+                      __private const int out_H,
+                      __private const int out_W,
+                      __private const int in_W,
+                      __private const int in_H,
+                      __private const int in_Stride0,
+                      __private const int in_Stride1,
+                      __private const int in_Stride2,
+                      __private const int out_Stride0,
+                      __private const int out_Stride1,
+                      __private const int out_Stride2) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const int out_n = out_nh / out_H;
+  const int out_h = out_nh % out_H;
+  const int out_c0 = out_c * 4;
+  const int out_c1 = out_c * 4 + 1;
+  const int out_c2 = out_c * 4 + 2;
+  const int out_c3 = out_c * 4 + 3;
+
+  int count0 =
+      out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count1 =
+      out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count2 =
+      out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count3 =
+      out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
+
+  int in_n0 = count0 / in_Stride2;
+  int in_n1 = count1 / in_Stride2;
+  int in_n2 = count2 / in_Stride2;
+  int in_n3 = count3 / in_Stride2;
+
+  count0 = count0 % in_Stride2;
+  count1 = count1 % in_Stride2;
+  count2 = count2 % in_Stride2;
+  count3 = count3 % in_Stride2;
+
+  int in_c0 = count0 / in_Stride1;
+  int in_c1 = count1 / in_Stride1;
+  int in_c2 = count2 / in_Stride1;
+  int in_c3 = count3 / in_Stride1;
+
+  int in_h0 = (count0 % in_Stride1) / in_Stride0;
+  int in_h1 = (count1 % in_Stride1) / in_Stride0;
+  int in_h2 = (count2 % in_Stride1) / in_Stride0;
+  int in_h3 = (count3 % in_Stride1) / in_Stride0;
+
+  int in_w0 = (count0 % in_Stride1) % in_Stride0;
+  int in_w1 = (count1 % in_Stride1) % in_Stride0;
+  int in_w2 = (count2 % in_Stride1) % in_Stride0;
+  int in_w3 = (count3 % in_Stride1) % in_Stride0;
+
+  int2 input_pos0;
+  int2 input_pos1;
+  int2 input_pos2;
+  int2 input_pos3;
+
+  input_pos0.x = (in_c0 / 4) * in_W + in_w0;
+  input_pos0.y = in_n0 * in_H + in_h0;
+
+  input_pos1.x = (in_c1 / 4) * in_W + in_w1;
+  input_pos1.y = in_n1 * in_H + in_h1;
+
+  input_pos2.x = (in_c2 / 4) * in_W + in_w2;
+  input_pos2.y = in_n2 * in_H + in_h2;
+
+  input_pos3.x = (in_c3 / 4) * in_W + in_w3;
+  input_pos3.y = in_n3 * in_H + in_h3;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 input0;
+  CL_DTYPE4 input1;
+  CL_DTYPE4 input2;
+  CL_DTYPE4 input3;
+  CL_DTYPE4 output;
+
+  input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0);
+  if (in_c0 % 4 == 0) {
+    output.x = input0.x;
+  } else if (in_c0 % 4 == 1) {
+    output.x = input0.y;
+  } else if (in_c0 % 4 == 2) {
+    output.x = input0.z;
+  } else {
+    output.x = input0.w;
+  }
+
+  if (out_C - out_c * 4 >= 2) {
+    input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1);
+    if (in_c1 % 4 == 0) {
+      output.y = input1.x;
+    } else if (in_c1 % 4 == 1) {
+      output.y = input1.y;
+    } else if (in_c1 % 4 == 2) {
+      output.y = input1.z;
+    } else {
+      output.y = input1.w;
+    }
+
+  } else {
+    output.y = 0.0f;
+  }
+
+  if (out_C - out_c * 4 >= 3) {
+    input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2);
+
+    if (in_c2 % 4 == 0) {
+      output.z = input2.x;
+    } else if (in_c2 % 4 == 1) {
+      output.z = input2.y;
+    } else if (in_c2 % 4 == 2) {
+      output.z = input2.z;
+    } else {
+      output.z = input2.w;
+    }
+  } else {
+    output.z = 0.0f;
+  }
+
+  if (out_C - out_c * 4 >= 4) {
+    input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3);
+    if (in_c3 % 4 == 0) {
+      output.w = input3.x;
+    } else if (in_c3 % 4 == 1) {
+      output.w = input3.y;
+    } else if (in_c3 % 4 == 2) {
+      output.w = input3.z;
+    } else {
+      output.w = input3.w;
+    }
+  } else {
+    output.w = 0.0f;
+  }
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index 13f527a2200198569054f2314e190b958973004e..3423b1e920e5e7c4aaa34125303b09d943e47b62 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -15,6 +15,7 @@ add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${te
 add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})

@@ -51,7 +52,9 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
 lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc
   DEPS conv2d_1x1_opencl cl_image_converter op_registry program context
   ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
-
+lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
+  DEPS reshape_opencl cl_image_converter op_registry program context
+  ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
   DEPS conv_opencl op_registry program context
   ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
diff --git a/lite/kernels/opencl/reshape_compute.cc b/lite/kernels/opencl/reshape_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7af648c5601e0a516eb92b3090cb8d7e836a5447
--- /dev/null
+++ b/lite/kernels/opencl/reshape_compute.cc
@@ -0,0 +1,206 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/logging.h"
+#include "lite/utils/replace_stl/stream.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+// reshape operator
+class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
+                                                   PRECISION(kFloat),
+                                                   DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::ReshapeParam;
+
+  void PrepareForRun() override {
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/reshape_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    VLOG(4) << "reshape_compute run ... ";
+
+    auto& param = *param_.get_mutable<param_t>();
+    const Tensor* const x = param.x;
+
+    const auto x_dims = x->dims();
+    const std::map<std::string, size_t>& input_image_shape =
+        InitImageDimInfoWith(x_dims);
+
+    const int64_t& input_image_width = input_image_shape.at("width");
+    const int64_t& input_image_height = input_image_shape.at("height");
+
+    const cl::Image2D* const x_image = x->data<float, cl::Image2D>();
+
+    const std::vector<int>& shape_vct = param.shape_vct;
+    Tensor* const output = param.output;
+    const DDimLite& out_dims = output->dims();
+    VLOG(4) << "out_dims= " << out_dims;
+
+    const std::map<std::string, size_t>& out_image_shape =
+        InitImageDimInfoWith(out_dims);
+    cl::Image2D* const out_image = output->mutable_data<float, cl::Image2D>(
+        out_image_shape.at("width"), out_image_shape.at("height"));
+    LOG(INFO) << "out_dims= " << out_dims;
+
+    const std::vector<size_t>& default_work_size = DefaultWorkSize(
+        out_dims,
+        DDim(std::vector<DDim::value_type>{
+            static_cast<DDim::value_type>(out_image_shape.at("width")),
+            static_cast<DDim::value_type>(out_image_shape.at("height"))}));
+
+    int x_v_dims[4] = {1, 1, 1, 1};
+    int out_v_dims[4] = {1, 1, 1, 1};
+    // 1 1000 1 1
+    for (int i = 0; i < x_dims.size(); i++) {
+      x_v_dims[4 - x_dims.size() + i] = x_dims[i];
+    }
+    // 1 1 1 1000
+    for (int i = 0; i < out_dims.size(); i++) {
+      out_v_dims[4 - out_dims.size() + i] = out_dims[i];
+    }
+
+    int out_C = out_v_dims[1];
+    int out_H = out_v_dims[2];
+    int out_W = out_v_dims[3];
+    int in_W = x_v_dims[3];
+    int in_H = x_v_dims[2];
+    int in_Stride0 = in_W;
+    int in_Stride1 = x_v_dims[2] * x_v_dims[3];
+    int in_Stride2 = x_v_dims[1] * x_v_dims[2] * x_v_dims[3];
+    int out_Stride0 = out_W;
+    int out_Stride1 = out_H * out_W;
+    int out_Stride2 = out_C * out_H * out_W;
+    VLOG(4) << "out_C=" << out_C;
+    VLOG(4) << "out_H=" << out_H;
+    VLOG(4) << "out_W=" << out_W;
+    VLOG(4) << "in_W=" << in_W;
+    VLOG(4) << "default_work_size= " << default_work_size[0] << ", "
+            << default_work_size[1] << ", " << default_work_size[2];
+    VLOG(4) << "in_Stride0=" << in_Stride0;
+    VLOG(4) << "in_Stride1=" << in_Stride1;
+    VLOG(4) << "out_Stride0=" << out_Stride0;
+    VLOG(4) << "out_Stride1=" << out_Stride1;
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    VLOG(4) << TargetToStr(x->target());
+    VLOG(4) << TargetToStr(param.output->target());
+
+    int arg_idx = 0;
+
+    cl_int status;
+    status = kernel.setArg(arg_idx, *x_image);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_image);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_C);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_H);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_W);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_W);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_H);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_Stride0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_Stride1);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_Stride2);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_Stride0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_Stride1);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_Stride2);
+    CL_CHECK_FATAL(status);
+
+    auto global_work_size =
+        cl::NDRange{static_cast<cl::size_type>(default_work_size.data()[0]),
+                    static_cast<cl::size_type>(default_work_size.data()[1]),
+                    static_cast<cl::size_type>(default_work_size.data()[2])};
+
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_wait_list()->emplace(out_image, event_);
+  }
+
+ private:
+  std::string kernel_func_name_{"reshape"};
+  std::string build_options_{"-DCL_DTYPE_float "};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(reshape,
+                     kOpenCL,
+                     kFloat,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(reshape2,
+                     kOpenCL,
+                     kFloat,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
diff --git a/lite/kernels/opencl/reshape_compute_test.cc b/lite/kernels/opencl/reshape_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5ba1c118e7fa952fe1172080ee97555a82c7260
--- /dev/null
+++ b/lite/kernels/opencl/reshape_compute_test.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <random>
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/operators/reshape_op.h"
+#include "lite/utils/logging.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+static DDim ValidateShape(const std::vector<int>& shape,
+                          const DDim& input_dims) {
+  const lite::DDim::value_type input_size = input_dims.production();
+  auto input_shape = input_dims.Vectorize();
+  bool all_positive = std::all_of(
+      input_shape.cbegin(), input_shape.cend(), [](lite::DDim::value_type i) {
+        return i > 0;
+      });
+  // only one dimension can be set to -1, whose size will be automatically
+  // inferred.
+  const int unk_dim_val = -1;
+  const int copy_dim_val = 0;
+
+  std::vector<lite::DDim::value_type> output_shape(shape.size(), 0);
+  lite::DDim::value_type capacity = 1;
+  int unk_dim_idx = -1;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (shape[i] == unk_dim_val) {
+      CHECK_EQ(unk_dim_idx, -1)
+          << "Only one input dimension of Attr(shape) can be unknown.";
+      unk_dim_idx = i;
+    } else if (shape[i] == copy_dim_val) {
+      CHECK_LT(static_cast<int>(i), input_shape.size())
+          << "The index of dimension to copy from input shape must be less "
+             "than the size of input shape.";
+    } else {
+      CHECK_GT(shape[i], 0) << "Each input dimension of Attr(shape) must not "
+                               "be negative except one unknown dimension.";
+    }
+
+    capacity *= (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
+                          : input_shape[i]);
+    output_shape[i] = (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
+                                : input_shape[i]);
+  }
+
+  if (unk_dim_idx != -1) {
+    if (all_positive) {
+      // input_size < 0 and is un-determinate in compile time, skip the check,
+      // for example, input_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
+      // capacity = -24, input_size = -8, output_shape[0] = 0
+      // the following check will fail.
+      output_shape[unk_dim_idx] = -input_size / capacity;
+      CHECK_EQ(output_shape[unk_dim_idx] * capacity, -input_size)
+          << "Invalid shape is given.";
+    } else {
+      output_shape[unk_dim_idx] = -1;
+    }
+  } else {
+    CHECK_EQ(capacity, input_size) << "Invalid shape is given.";
+  }
+  return lite::DDim(output_shape);
+}
+
+TEST(reshape_opencl, compute) {
+  LOG(INFO) << "to get kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "reshape", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  auto kernel = std::move(kernels.front());
+
+  LOG(INFO) << "created reshape kernel";
+
+  LOG(INFO) << "prepare kernel ------";
+
+  int64_t batch_size = 1;
+  int64_t ic = 2;
+  int64_t ih = 4;
+  int64_t iw = 6;
+
+  lite::Tensor input, output;
+
+  operators::ReshapeParam param;
+
+  Tensor shape_tensor;
+  shape_tensor.Resize({2});
+  auto* shape_tensor_data = shape_tensor.mutable_data<int>();
+  shape_tensor_data[0] = 6;
+  shape_tensor_data[1] = 8;
+
+  param.x = &input;
+  param.shape_tensor = &shape_tensor;  // use shape_tensor
+  param.inplace = false;
+  param.output = &output;
+
+  const DDim input_dim =
+      lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
+  input.Resize(input_dim);
+
+  std::vector<int> final_shape = std::vector<int>(
+      shape_tensor_data, shape_tensor_data + shape_tensor.numel());
+
+  auto output_dim = ValidateShape(final_shape, input_dim);
+  param.output->Resize(output_dim);
+  LOG(INFO) << " output_dim------" << output_dim;
+
+  LOG(INFO) << "prepare kernel SetParam------";
+  kernel->SetParam(param);
+
+  size_t input_image_width = iw * ((ic + 3) / 4);
+  size_t input_image_height = ih * batch_size;
+
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+
+  // LOG(INFO) << "map input ...";
+  // auto* mapped_input =
+  //     static_cast<float*>(TargetWrapperCL::MapImage(input_data,
+  //                                                   input_image_width,
+  //                                                   input_image_height,
+  //                                                   cl_image2d_row_pitch,
+  //                                                   cl_image2d_slice_pitch));
+
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> gen(-5, 5);
+  std::vector<float> input_v(batch_size * ic * ih * iw);
+
+  LOG(INFO) << "gen input ...";
+
+  float* input_v_data = &input_v[0];
+  for (auto& i : input_v) {
+    i = gen(engine);
+  }
+  paddle::lite::CLImageConverterDefault default_convertor;
+
+  std::vector<float> x_image_data(input_image_width * input_image_height *
+                                  4);  // 4 : RGBA
+
+  LOG(INFO) << "set mapped input ...";
+  default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
+
+  auto* input_image = input.mutable_data<float, cl::Image2D>(
+      input_image_width, input_image_height, x_image_data.data());
+
+  LOG(INFO) << "prepare kernel ready";
+
+  LOG(INFO) << "mutable output ...";
+  CLImageConverterDefault default_converter;
+  DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = output.mutable_data<float, cl::Image2D>(
+      out_image_shape[0], out_image_shape[1]);
+  VLOG(4) << "out_dims= " << output_dim;
+
+  LOG(INFO) << "kernel context ...";
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+
+  std::unique_ptr<KernelContext> reshape_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(reshape_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(reshape_context));
+
+  LOG(INFO) << "kernel launch ...";
+  kernel->Launch();
+
+  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  auto* out_ptr = param.output->data<float, cl::Image2D>();
+  auto it = wait_list->find(out_image);
+
+  if (it != wait_list->end()) {
+    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+    auto& event = *(it->second);
+    event.wait();
+  } else {
+    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+  }
+
+  float* out_image_data = new float[out_image_shape.production() * 4];
+  TargetWrapperCL::ImgcpySync(out_image_data,
+                              output.data<float, cl::Image2D>(),
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  float* out_data = new float[out_image_shape.production() * 4];
+  default_converter.ImageToNCHW(
+      out_image_data, out_data, out_image_shape, output_dim);
+  // check output dims
+  for (int i = 0; i < output.dims().size(); i++) {
+    CHECK_EQ(output.dims()[i], shape_tensor_data[i]);
+  }
+
+  // check output data
+  for (int i = 0; i < output.numel(); i++) {
+    EXPECT_NEAR(out_data[i], input_v_data[i], 1e-3);
+    if (abs(out_data[i] - input_v_data[i]) > 1e-3) {
+      LOG(INFO) << "error idx:" << i;
+    }
+  }
+}
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(reshape, kOpenCL, kFloat, kImageDefault, image2d);
+USE_LITE_KERNEL(reshape2, kOpenCL, kFloat, kImageDefault, image2d);