diff --git a/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1574cb4a69cd0388698707d8d91c1d9c18b625a2
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void concat2(__global const CL_DTYPE* x_data0,
+                      __global const CL_DTYPE* x_data1,
+                      __global CL_DTYPE* out_data,
+                      int size, int axis_size, int pre_size, int post_size,
+                      int total, int total0, int total1) {
+  const int index = get_global_id(0);  // index along the concat axis
+  if (index < size) {
+    // the first `size` positions on the axis come from x_data0
+    for (int i = 0; i < pre_size; i++) {
+      int offset_out = index * post_size + i * total;
+      int offset_in = index * post_size + i * total0;
+      // element-wise copy: memcpy is not available in OpenCL C
+      __global CL_DTYPE* dst = out_data + offset_out;
+      __global const CL_DTYPE* src = x_data0 + offset_in;
+      for (int k = 0; k < post_size; k++) {
+        *dst++ = *src++;
+      }
+    }
+  } else if (index < axis_size) {
+    // the remaining positions come from x_data1; rebase the axis index
+    for (int i = 0; i < pre_size; i++) {
+      int offset_out = index * post_size + i * total;
+      int offset_in = (index - size) * post_size + i * total1;
+      __global CL_DTYPE* dst = out_data + offset_out;
+      __global const CL_DTYPE* src = x_data1 + offset_in;
+      for (int k = 0; k < post_size; k++) {
+        *dst++ = *src++;
+      }
+    }
+  }
+}
+
+__kernel void concat_mul(__global const CL_DTYPE* x_data,
+                         __global CL_DTYPE* out_data,
+                         int axis_size, int pre_size, int post_size,
+                         int start, int total, int total0) {
+  const int index = get_global_id(0);
+  if (index < axis_size) {
+    for (int i = 0; i < pre_size; i++) {
+      int offset_out = (start + index) * post_size + i * total;
+      int offset_in = index * post_size + i * total0;
+      __global CL_DTYPE* dst = out_data + offset_out;
+      __global const CL_DTYPE* src = x_data + offset_in;
+      for (int k = 0; k < post_size; k++) {
+        *dst++ = *src++;
+      }
+    }
+  }
+}
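Editor's note (not part of the patch): the buffer kernels above address both
tensors through a (pre_size, axis_size, post_size) decomposition of the NCHW
shape. The host-side C++ sketch below mirrors that offset arithmetic and can
be used to sanity-check it; the function name and signature are illustrative,
not Paddle-Lite API.

    #include <cstddef>
    #include <vector>

    // Reference concat over flat float arrays, matching the kernels' layout:
    // out[(start + a) * post + i * total + p] = in_j[a * post + i * total_j + p]
    void concat_ref(const std::vector<const float*>& ins,
                    const std::vector<int>& axis_extents,  // per-input axis size
                    int pre_size, int post_size, float* out) {
      int axis_size = 0;
      for (int e : axis_extents) axis_size += e;
      const int total = axis_size * post_size;  // output stride per pre-index
      int start = 0;
      for (std::size_t j = 0; j < ins.size(); ++j) {
        const int total_j = axis_extents[j] * post_size;  // input j's stride
        for (int i = 0; i < pre_size; ++i) {
          for (int k = 0; k < total_j; ++k) {
            out[start * post_size + i * total + k] = ins[j][i * total_j + k];
          }
        }
        start += axis_extents[j];
      }
    }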
diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f0335116f87aac34740dd22ac68f2b6265e62445
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void concat2(__read_only image2d_t input0,
+                      __read_only image2d_t input1,
+                      __write_only image2d_t output,
+                      int axis_size, int flag, int width) {
+  const int x = get_global_id(0);  // image_width: c/4 * w
+  const int y = get_global_id(1);  // image_height: n * h
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  // block index on the concat axis; flag selects which image dim it lives on
+  int xx = x / width;
+  if (flag == 0) {
+    xx = y / width;
+  }
+  if (xx < axis_size) {
+    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  } else {
+    // rebase the block index to input1's origin, keeping the in-block offset
+    int new_val = (xx - axis_size) * width;
+    int2 coord = (int2)(x, y);
+    if (flag == 0) {
+      coord.y = new_val + y % width;
+    } else {
+      coord.x = new_val + x % width;
+    }
+    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, coord);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  }
+}
+
+__kernel void concat_mul(__read_only image2d_t input0,
+                         __write_only image2d_t output,
+                         int axis_size, int flag, int width, int start) {
+  const int x = get_global_id(0);  // image_width: c/4 * w
+  const int y = get_global_id(1);  // image_height: n * h
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  int xx = x / width;
+  if (flag == 0) {
+    xx = y / width;
+  }
+
+  if (xx >= start && xx < axis_size) {
+    // map the output block back into this input's coordinates
+    int new_val = (xx - start) * width;
+    int2 coord = (int2)(x, y);
+    if (flag == 0) {
+      coord.y = new_val + y % width;
+    } else {
+      coord.x = new_val + x % width;
+    }
+    CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, coord);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
+  }
+}
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index a802d1a9ebafc81f1a2b2c04fc190d6a8a7818c3..9074a503dfd78289065c34f051010133cf41b552 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -20,6 +20,7 @@ add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc
 add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
 add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
 
 lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
@@ -83,6 +84,11 @@ lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
 lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
   DEPS layout_opencl op_registry program context cl_image_converter
   ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+
+lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc
+  DEPS concat_opencl layout_opencl op_registry program context
+  ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+
 lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
   DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
   ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
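Editor's note (not part of the patch): the image kernels and the host code
below rely on Paddle-Lite's default NCHW-to-image2d mapping, in which four
channels are packed into one RGBA pixel, so dividing a coordinate by the
per-block `width` recovers the index of the block being concatenated, and
`flag` selects whether that block lives on the image's x or y dimension. A
self-contained sketch of the mapping (struct and function are illustrative,
not Paddle-Lite API):

    #include <cstdint>

    struct ImageShape {
      int64_t width;   // ceil(c / 4) * w, x = c4 * w_pos + w_idx
      int64_t height;  // n * h,           y = n_idx * h + h_idx
    };

    inline ImageShape ImageShapeOfNCHW(int64_t n, int64_t c,
                                       int64_t h, int64_t w) {
      return {(c + 3) / 4 * w, n * h};  // 4 channels per pixel
    }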
diff --git a/lite/kernels/opencl/concat_compute.cc b/lite/kernels/opencl/concat_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f25439ed00a9ff579bbd59a543dba3c8c3b090b
--- /dev/null
+++ b/lite/kernels/opencl/concat_compute.cc
@@ -0,0 +1,372 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/opencl/concat_compute.h"
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/replace_stl/stream.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+template <>
+void ConcatCompute<PRECISION(kFloat),
+                   DATALAYOUT(kImageDefault)>::PrepareForRun() {
+  auto& context = ctx_->As<OpenCLContext>();
+  concat_param_ = param_.get_mutable<param_t>();
+  if (concat_param_->x.size() == 2) {
+    kernel_func_name_ = "concat2";
+  } else {
+    kernel_func_name_ = "concat_mul";
+  }
+  context.cl_context()->AddKernel(
+      kernel_func_name_, "image/concat_kernel.cl", build_options_);
+
+  auto axis = concat_param_->axis;
+  auto inputs = concat_param_->x;
+  auto out_dims = concat_param_->output->dims();
+  auto* axis_tensor = concat_param_->axis_tensor;
+  if (axis_tensor != nullptr) {
+    // TODO: read the axis from axis_tensor once host access is wired up
+    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
+    // axis = axis_tensor_data[0];
+  }
+  auto in_dims = inputs[0]->dims();
+  axis_size_ = out_dims[axis];
+  axis_ = axis;
+  for (int i = 0; i < axis; i++) {
+    pre_size_ *= in_dims[i];
+  }
+  for (int i = axis + 1; i < in_dims.size(); i++) {
+    post_size_ *= in_dims[i];
+  }
+  for (int i = 1; i < inputs.size(); i++) {
+    auto dims = inputs[i]->dims();
+    if (in_dims.size() != dims.size()) {
+      LOG(FATAL) << "All concat inputs must have the same rank.";
+      return;
+    }
+    for (int j = 0; j < dims.size(); j++) {
+      if (j != axis && in_dims[j] != dims[j]) {
+        LOG(FATAL) << "Concat inputs may differ only along the concat axis.";
+        return;
+      }
+    }
+  }
+}
+
+template <>
+void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
+  auto& param = *param_.get_mutable<param_t>();
+  const auto& x_dims = param.output->dims();
+  auto image_shape = InitImageDimInfoWith(x_dims);
+  auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
+      image_shape["width"], image_shape["height"]);
+  const auto& y_dims = param.output->dims();  // useless: check dim only
+
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+  STL::stringstream kernel_key;
+  kernel_key << kernel_func_name_ << build_options_;
+
+  auto inputs = param.x;
+  int arg_idx = 0;
+  // default to the innermost dim; overwritten by the switch on axis_ below
+  int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
+  auto global_work_size =
+      cl::NDRange{static_cast<size_t>(image_shape["width"]),
+                  static_cast<size_t>(image_shape["height"])};
+  VLOG(4) << TargetToStr(param.output->target());
+  VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
+          << image_shape["height"];
+  VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
+          << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
+  VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
+          << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
+  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+  int flag = 1;  // cxw
+  switch (axis_) {
+    case 0:
+      width = x_dims[2];  // n
+      flag = 0;
+      break;
+    case 1:
+      width = x_dims[3];  // c
+      break;
+    case 2:
+      width = x_dims[0];  // h
+      flag = 0;
+      break;
+    case 3:
+    case -1:
+      width = x_dims[1];  // w
+      break;
+    default:
+      LOG(FATAL) << "axis " << axis_ << " is not supported";
+  }
+  if (inputs.size() == 2) {
+    auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
+    auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
+    cl_int status = kernel.setArg(arg_idx, *x_buf0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *x_buf1);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_buf);
+    CL_CHECK_FATAL(status);
+    status =
+        kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, flag);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, width);
+    CL_CHECK_FATAL(status);
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_context()->GetCommandQueue().finish();
+  } else {
+    auto start = 0;
+    for (int i = 0; i < inputs.size(); i++) {
+      arg_idx = 0;
+      auto* x_buf = inputs[i]->data<float, cl::Image2D>();
+      cl_int status = kernel.setArg(arg_idx, *x_buf);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, *out_buf);
+      CL_CHECK_FATAL(status);
+      // keep the argument order in sync with the kernel signature:
+      // (axis_size, flag, width, start)
+      status = kernel.setArg(++arg_idx, axis_size_);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, flag);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, width);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, start);
+      CL_CHECK_FATAL(status);
+      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+          kernel,
+          cl::NullRange,
+          global_work_size,
+          cl::NullRange,
+          nullptr,
+          event_.get());
+      CL_CHECK_FATAL(status);
+      context.cl_context()->GetCommandQueue().finish();
+      start += inputs[i]->dims()[axis_];
+    }
+  }
+}
+
+template <>
+std::string ConcatCompute<PRECISION(kFloat),
+                          DATALAYOUT(kImageDefault)>::doc() {
+  return "Concat using cl::Image, kFloat";
+}
+
+template <>
+void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
+  auto& context = ctx_->As<OpenCLContext>();
+  concat_param_ = param_.get_mutable<param_t>();
+  if (concat_param_->x.size() == 2) {
+    kernel_func_name_ = "concat2";
+  } else {
+    kernel_func_name_ = "concat_mul";
+  }
+  context.cl_context()->AddKernel(
+      kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
+
+  auto axis = concat_param_->axis;
+  auto inputs = concat_param_->x;
+  auto out_dims = concat_param_->output->dims();
+  auto* axis_tensor = concat_param_->axis_tensor;
+  if (axis_tensor != nullptr) {
+    // TODO: read the axis from axis_tensor once host access is wired up
+    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
+    // axis = axis_tensor_data[0];
+  }
+  auto in_dims = inputs[0]->dims();
+  axis_size_ = out_dims[axis];
+  axis_ = axis;
+  for (int i = 0; i < axis; i++) {
+    pre_size_ *= in_dims[i];
+  }
+  for (int i = axis + 1; i < in_dims.size(); i++) {
+    post_size_ *= in_dims[i];
+  }
+  for (int i = 1; i < inputs.size(); i++) {
+    auto dims = inputs[i]->dims();
+    if (in_dims.size() != dims.size()) {
+      LOG(FATAL) << "All concat inputs must have the same rank.";
+      return;
+    }
+    for (int j = 0; j < dims.size(); j++) {
+      if (j != axis && in_dims[j] != dims[j]) {
+        LOG(FATAL) << "Concat inputs may differ only along the concat axis.";
+        return;
+      }
+    }
+  }
+}
+
+template <>
+void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
+  auto& param = *param_.get_mutable<param_t>();
+  const auto& x_dims = param.output->dims();
+  auto image_shape = InitImageDimInfoWith(x_dims);
+  auto* out_buf =
+      param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  const auto& y_dims = param.output->dims();  // useless: check dim only
+
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+  STL::stringstream kernel_key;
+  kernel_key << kernel_func_name_ << build_options_;
+
+  auto inputs = param.x;
+  int arg_idx = 0;
+  auto global_work_size = cl::NDRange{static_cast<size_t>(axis_size_)};
+  int total = axis_size_ * post_size_;
+  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+  if (inputs.size() == 2) {
+    auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
+    auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
+    auto axis0 = inputs[0]->dims()[axis_];
+    int total0 = axis0 * post_size_;
+    int total1 = (axis_size_ - axis0) * post_size_;
+    cl_int status = kernel.setArg(arg_idx, *x_buf0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *x_buf1);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_buf);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, axis_size_);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, pre_size_);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, post_size_);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, total);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, total0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, total1);
+    CL_CHECK_FATAL(status);
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_wait_list()->emplace(out_buf, event_);
+  } else {
+    auto start = 0;
+    for (int i = 0; i < inputs.size(); i++) {
+      arg_idx = 0;
+      int size = inputs[i]->dims()[axis_];
+      auto* x_buf = inputs[i]->data<float, cl::Buffer>();
+      global_work_size = cl::NDRange{static_cast<size_t>(size)};
+      int total0 = size * post_size_;
+      cl_int status = kernel.setArg(arg_idx, *x_buf);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, *out_buf);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<int>(size));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, pre_size_);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, post_size_);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, start);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, total);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, total0);
+      CL_CHECK_FATAL(status);
+      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+          kernel,
+          cl::NullRange,
+          global_work_size,
+          cl::NullRange,
+          nullptr,
+          event_.get());
+      CL_CHECK_FATAL(status);
+      context.cl_wait_list()->emplace(out_buf, event_);
+      start += size;
+    }
+  }
+}
+
+template <>
+std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
+  return "Concat using cl::Buffer, kFloat";
+}
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
+                                                     DATALAYOUT(kNCHW)>
+    Concat_buffer;
+
+typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
+                                                     DATALAYOUT(kImageDefault)>
+    Concat_image;
+
+REGISTER_LITE_KERNEL(
+    concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("AxisTensor",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kInt32),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW))})
+    .BindInput("AxisTensor",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kInt32),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
diff --git a/lite/kernels/opencl/concat_compute.h b/lite/kernels/opencl/concat_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bed6a18146d76043fbfcd72236ba39c5607328b
--- /dev/null
+++ b/lite/kernels/opencl/concat_compute.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/kernel.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+template <PrecisionType Ptype, DataLayoutType layout>
+class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> {
+ public:
+  using param_t = operators::ConcatParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  std::string doc();  // override;
+
+  int axis_size_ = 1;
+  int post_size_ = 1;
+  int pre_size_ = 1;
+  int axis_ = 1;
+  param_t* concat_param_{nullptr};
+  std::string kernel_func_name_{};
+  std::string build_options_{"-DCL_DTYPE_float"};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
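Editor's note (not part of the patch): both PrepareForRun specializations above
enforce the same contract: every input must have the rank of the first input,
and dims may differ only on the concat axis, whose extents sum to the output
extent. A self-contained sketch of that rule (function and types are
illustrative, not Paddle-Lite API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Infer the concat output shape, asserting the inputs are compatible.
    std::vector<int64_t> ConcatOutShape(
        const std::vector<std::vector<int64_t>>& in_shapes, size_t axis) {
      std::vector<int64_t> out = in_shapes.at(0);
      for (size_t i = 1; i < in_shapes.size(); ++i) {
        assert(in_shapes[i].size() == out.size());  // same rank
        for (size_t d = 0; d < out.size(); ++d) {
          if (d == axis) {
            out[d] += in_shapes[i][d];  // extents add up on the concat axis
          } else {
            assert(in_shapes[i][d] == out[d]);  // other dims must match
          }
        }
      }
      return out;
    }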
diff --git a/lite/kernels/opencl/concat_compute_test.cc b/lite/kernels/opencl/concat_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37e7b6658be2eaa60285474b3766ce462ea3779b
--- /dev/null
+++ b/lite/kernels/opencl/concat_compute_test.cc
@@ -0,0 +1,390 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <random>
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/kernels/opencl/image_helper.h"
+
+namespace paddle {
+namespace lite {
+
+template <typename dtype>
+void concat2_compute_ref(const dtype *in0,
+                         const dtype *in1,
+                         const int axis,
+                         const DDim in0_dim,
+                         const DDim in1_dim,
+                         const DDim out_dim,
+                         dtype *out_data) {
+  int pre_size = 1;
+  int post_size = 1;
+  for (int i = 0; i < axis; i++) {
+    pre_size *= in0_dim[i];
+  }
+  for (int i = axis + 1; i < in0_dim.size(); i++) {
+    post_size *= in0_dim[i];
+  }
+  int axis_size = out_dim[axis];
+  for (int i = 0; i < pre_size; i++) {
+    for (int j = 0; j < axis_size; j++) {
+      if (j < in0_dim[axis]) {
+        memcpy(out_data, in0, sizeof(dtype) * post_size);
+        in0 += post_size;
+      } else {
+        // the tail of the concat axis comes from in1
+        memcpy(out_data, in1, sizeof(dtype) * post_size);
+        in1 += post_size;
+      }
+      out_data += post_size;
+    }
+  }
+}
+
+template <typename dtype>
+void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
+                            std::vector<DDim> ins_dim,
+                            int axis,
+                            const DDim out_dim,
+                            dtype *out_data) {
+  int pre_size = 1;
+  int post_size = 1;
+  for (int i = 0; i < axis; i++) {
+    pre_size *= ins_dim[0][i];
+  }
+  for (int i = axis + 1; i < ins_dim[0].size(); i++) {
+    post_size *= ins_dim[0][i];
+  }
+  for (int i = 0; i < pre_size; i++) {
+    for (int j = 0; j < ins_data.size(); j++) {
+      int size = post_size * ins_dim[j][axis];
+      memcpy(out_data, ins_data[j], sizeof(dtype) * size);
+      ins_data[j] += size;  // advance this input for the next pre-index
+      out_data += size;
+    }
+  }
+}
+
+#if 1  // concat_buffer
+TEST(opencl_concat_buffer, compute) {
+  // prepare data
+  const DDim x0_dim = DDim(std::vector<int64_t>{1, 2, 3, 4});
+  const DDim x1_dim = DDim(std::vector<int64_t>{1, 2, 3, 4});
+  const DDim x2_dim = DDim(std::vector<int64_t>{1, 2, 3, 4});
+  const DDim out_dim = DDim(std::vector<int64_t>{1, 6, 3, 4});
+  lite::Tensor x0, x1, x2, out, out_ref;
+  x0.Resize(x0_dim);
+  x1.Resize(x1_dim);
+  x2.Resize(x2_dim);
+  out.Resize(out_dim);
+  out_ref.Resize(out_dim);
+
+  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-10, 10);
+  auto *mapped_x0 = static_cast<float *>(
+      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
+  auto *mapped_x1 = static_cast<float *>(
+      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
+  auto *mapped_x2 = static_cast<float *>(
+      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
+  for (int i = 0; i < x0_dim.production(); i++) {
+    mapped_x0[i] = dist(engine);
+  }
+  for (int i = 0; i < x1_dim.production(); i++) {
+    mapped_x1[i] = dist(engine);
+  }
+  for (int i = 0; i < x2_dim.production(); i++) {
+    mapped_x2[i] = dist(engine);
+  }
+
+  // set param and kernel, then run
+  operators::ConcatParam param;
+  std::vector<lite::Tensor *> ins;
+  ins.push_back(&x0);
+  ins.push_back(&x1);
+  ins.push_back(&x2);
+  auto axis = 1;
+  param.x = ins;
+  param.output = &out;
+  param.axis = axis;
+
+  std::vector<const float *> ins_data;
+  std::vector<DDim> ins_dim;
+
+  ins_data.push_back(mapped_x0);
+  ins_data.push_back(mapped_x1);
+  ins_data.push_back(mapped_x2);
+  ins_dim.push_back(x0_dim);
+  ins_dim.push_back(x1_dim);
+  ins_dim.push_back(x2_dim);
+
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+  auto kernels = KernelRegistry::Global().Create(
+      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
+  ASSERT_FALSE(kernels.empty());
+  auto kernel = std::move(kernels.front());
+  kernel->SetParam(param);
+  std::unique_ptr<KernelContext> concat_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(concat_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(concat_context));
+  kernel->Launch();
+
+  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+  auto *out_ptr = param.output->data<float, cl::Buffer>();
+  auto it = wait_list->find(out_ptr);
+  if (it != wait_list->end()) {
+    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+    auto &event = *(it->second);
+    event.wait();
+  } else {
+    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+  }
+
+  // run compute ref and check
+  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
+  concat_mul_compute_ref(ins_data, ins_dim, axis, out_dim, out_ref_data);
+
+  auto *out_data = out.mutable_data<float, cl::Buffer>();
+  auto *mapped_out = static_cast<float *>(
+      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
+  for (int i = 0; i < out_dim.production(); i++) {
+    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
+  }
+  TargetWrapperCL::Unmap(out_data, mapped_out);
+  TargetWrapperCL::Unmap(x0_data, mapped_x0);
+  TargetWrapperCL::Unmap(x1_data, mapped_x1);
+  TargetWrapperCL::Unmap(x2_data, mapped_x2);
+}
+#endif  // concat_buffer
+
+// #define LOOP_TEST
+// #define PRINT_RESULT
+TEST(concat_image2d_fp32, compute) {
+  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
+               "layout(img2buf) "
+               "-> host";
+
+#ifdef LOOP_TEST
+  for (int n = 1; n <= 100; n += 33) {
+    for (auto c : {1, 3}) {
+      for (int h = 12; h <= 100; h += 13) {
+        for (int w = 12; w <= 100; w += 25) {
+          for (auto &axis : {0, 1, 2, 3}) {
+#else
+  const int n = 1;
+  const int c = 2;
+  const int h = 3;
+  const int w = 4;
+  const int axis = 1;
+#endif  // LOOP_TEST
+            LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
+                      << " " << h << " " << w << " ========";
+            LOG(INFO) << "======== axis: " << axis;
+            // set layout kernels
+            auto buf_to_img_kernels =
+                KernelRegistry::Global().Create("layout",
+                                                TARGET(kOpenCL),
+                                                PRECISION(kAny),
+                                                DATALAYOUT(kImageDefault));
+            auto buf_to_img_kernels1 =
+                KernelRegistry::Global().Create("layout",
+                                                TARGET(kOpenCL),
+                                                PRECISION(kAny),
+                                                DATALAYOUT(kImageDefault));
+            auto img_to_buf_kernels = KernelRegistry::Global().Create(
+                "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
+            auto concat_img_kernels =
+                KernelRegistry::Global().Create("concat",
+                                                TARGET(kOpenCL),
+                                                PRECISION(kFloat),
+                                                DATALAYOUT(kImageDefault));
+            ASSERT_FALSE(buf_to_img_kernels.empty());
+            ASSERT_FALSE(buf_to_img_kernels1.empty());
+            ASSERT_FALSE(img_to_buf_kernels.empty());
+            ASSERT_FALSE(concat_img_kernels.empty());
+
+            auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
+            auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front());
+            auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
+            auto concat_img_kernel = std::move(concat_img_kernels.front());
+            LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
+            LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc();
+            LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
+            LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc();
+
+            // set tensors about op param
+            LOG(INFO) << "set tensors about op param";
+            lite::Tensor x0, x1, y, concat_in0, concat_in1, concat_out, y_ref;
+            operators::LayoutParam BufferToImageParam0, BufferToImageParam1;
+            operators::LayoutParam ImageToBufferParam;
+            BufferToImageParam0.x = &x0;
+            BufferToImageParam0.y = &concat_in0;
+            BufferToImageParam1.x = &x1;
+            BufferToImageParam1.y = &concat_in1;
+            ImageToBufferParam.x = &concat_out;
+            ImageToBufferParam.y = &y;
+            std::vector<lite::Tensor *> ins;
+            operators::ConcatParam concatParam;
+            ins.push_back(&concat_in0);
+            ins.push_back(&concat_in1);
+            concatParam.x = ins;
+            concatParam.axis = axis;
+            concatParam.output = &concat_out;
+
+            const DDim x0_dim = DDim(std::vector<int64_t>{n, c, h, w});
+            DDim x1_dim = DDim(std::vector<int64_t>{n, c, h, w});
+            DDim out_dim = DDim(std::vector<int64_t>{n, c, h, w});
+            x1_dim[axis] += 2;
+            out_dim[axis] = x0_dim[axis] + x1_dim[axis];
+            x0.Resize(x0_dim);
+            x1.Resize(x1_dim);
+            y.Resize(out_dim);
+            concat_in0.Resize(x0_dim);
+            concat_in1.Resize(x1_dim);
+            concat_out.Resize(out_dim);
+            y_ref.Resize(out_dim);
+            auto concat_image2d_shape =
+                paddle::lite::kernels::opencl::InitImageDimInfoWith(out_dim);
+            auto concat_image2d_shape_in0 =
+                paddle::lite::kernels::opencl::InitImageDimInfoWith(x0_dim);
+            auto concat_image2d_shape_in1 =
+                paddle::lite::kernels::opencl::InitImageDimInfoWith(x1_dim);
+
+            // initialize tensors
+            LOG(INFO) << "initialize tensors";
+            auto *x_data0 =
+                x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+            auto *x_data1 =
+                x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+            auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+            auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
+            auto *mapped_x0 = static_cast<float *>(TargetWrapperCL::Map(
+                x_data0, 0, sizeof(float) * x0_dim.production()));
+            auto *mapped_x1 = static_cast<float *>(TargetWrapperCL::Map(
+                x_data1, 0, sizeof(float) * x1_dim.production()));
+            auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
+                y_data, 0, sizeof(float) * out_dim.production()));
+            for (int i = 0; i < x0_dim.production(); ++i) {
+              mapped_x0[i] = static_cast<float>(i) - x0_dim.production() / 2;
+            }
+            for (int i = 0; i < x1_dim.production(); ++i) {
+              mapped_x1[i] = static_cast<float>(i) - x1_dim.production() / 2;
+            }
+            for (int i = 0; i < out_dim.production(); ++i) {
+              mapped_y[i] = static_cast<float>(0);
+            }
+            auto *concat_in_data0 =
+                concat_in0.mutable_data<float, cl::Image2D>(
+                    concat_image2d_shape_in0["width"],
+                    concat_image2d_shape_in0["height"]);
+            auto *concat_in_data1 =
+                concat_in1.mutable_data<float, cl::Image2D>(
+                    concat_image2d_shape_in1["width"],
+                    concat_image2d_shape_in1["height"]);
+            auto *concat_out_data =
+                concat_out.mutable_data<float, cl::Image2D>(
+                    concat_image2d_shape["width"],
+                    concat_image2d_shape["height"]);
+
+            // set context and kernel args
+            LOG(INFO) << "set context and kernel args";
+            std::unique_ptr<KernelContext> context(new KernelContext);
+            context->As<OpenCLContext>().InitOnce();
+
+            buf_to_img_kernel->SetParam(BufferToImageParam0);
+            std::unique_ptr<KernelContext> buf_to_img_context(
+                new KernelContext);
+            context->As<OpenCLContext>().CopySharedTo(
+                &(buf_to_img_context->As<OpenCLContext>()));
+            buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
+            buf_to_img_kernel1->SetParam(BufferToImageParam1);
+            std::unique_ptr<KernelContext> buf_to_img_context1(
+                new KernelContext);
+            context->As<OpenCLContext>().CopySharedTo(
+                &(buf_to_img_context1->As<OpenCLContext>()));
+            buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1));
+
+            img_to_buf_kernel->SetParam(ImageToBufferParam);
+            std::unique_ptr<KernelContext> img_to_buf_context(
+                new KernelContext);
+            context->As<OpenCLContext>().CopySharedTo(
+                &(img_to_buf_context->As<OpenCLContext>()));
+            img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
+
+            concat_img_kernel->SetParam(concatParam);
+            std::unique_ptr<KernelContext> concat_img_context(
+                new KernelContext);
+            context->As<OpenCLContext>().CopySharedTo(
+                &(concat_img_context->As<OpenCLContext>()));
+            concat_img_kernel->SetContext(std::move(concat_img_context));
+
+            // run kernels
+            LOG(INFO) << "run kernel: buf_to_img_kernel";
+            buf_to_img_kernel->Launch();
+            buf_to_img_kernel1->Launch();
+            LOG(INFO) << "run kernel: concat_img_kernel";
+            concat_img_kernel->Launch();
+            LOG(INFO) << "run kernel: img_to_buf_kernel";
+            img_to_buf_kernel->Launch();
+
+            // compute ref cpu result
+            std::vector<const float *> ins_ptr;
+            std::vector<DDim> in_dim;
+            ins_ptr.push_back(mapped_x0);
+            ins_ptr.push_back(mapped_x1);
+            in_dim.push_back(x0_dim);
+            in_dim.push_back(x1_dim);
+            concat_mul_compute_ref<float>(
+                ins_ptr, in_dim, axis, out_dim, y_data_ref);
+// result
+#ifdef PRINT_RESULT
+            LOG(INFO) << "---- print kernel result (input -> output) ----";
+            for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
+              std::cout << mapped_x0[eidx] << ", " << mapped_x1[eidx] << " -> "
+                        << mapped_y[eidx] << std::endl;
+            }
+#endif  // PRINT_RESULT
+
+            // check result: compare kernel output and cpu output(y_data_ref)
+            for (int eidx = 0; eidx < out_dim.production(); eidx++) {
+              EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
+              if (fabs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
+                LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
+                          << " / " << x0_dim.production() << ", y_data_ref["
+                          << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
+                          << eidx << "]:" << mapped_y[eidx];
+                break;
+              }
+            }
+
+            // free
+            LOG(INFO) << "free: unmap x, y";
+            TargetWrapperCL::Unmap(x_data0, mapped_x0);
+            TargetWrapperCL::Unmap(x_data1, mapped_x1);
+            TargetWrapperCL::Unmap(y_data, mapped_y);
+#ifdef LOOP_TEST
+          }  // axis
+        }    // w
+      }      // h
+    }        // c
+  }          // n
+#else
+// nothing to do.
+#endif
+}
+
+}  // namespace lite
+}  // namespace paddle
+
+// concat buffer
+USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
+
+// concat image2d fp32
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
+USE_LITE_KERNEL(concat, kOpenCL, kFloat, kImageDefault, ImageDefault);
diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc
index df238ceae9e39541fb954d9262832d01cd9d3b7f..ceb35ffb6e4c728904d1f63f96f16434a561e904 100644
--- a/lite/tests/math/conv_compute_test.cc
+++ b/lite/tests/math/conv_compute_test.cc
@@ -307,7 +307,7 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
 #endif  // LITE_WITH_ARM
 
 // TODO(chenjiaoAngel): fix multi-threds, diff: 3x3 depthwise conv
-#if 1  /// 3x3dw
+#if 1  // 3x3dw
 TEST(TestConv3x3DW, test_conv3x3_depthwise) {
   if (FLAGS_basic_test) {
     for (auto& stride : {1, 2}) {
@@ -449,7 +449,7 @@ TEST(TestConv3x3s1, test_conv_3x3s1) {
           dims.push_back(DDim({batch, cin, h, h}));
         }
       }
-      if (cin == 1 && cout ==1) {
+      if (cin == 1 && cout == 1) {
         continue;
       }
       const float leakey_relu_scale = 8.88;