提交 3368a1e0 编写于 作者: X xiebaiyuan 提交者: GitHub

Develop lite reshape (#2613)

* add reshape opencl kernel && optimise conv 1x1 ,test=develop

* add reshape opencl kernel && optimise conv 1x1 &&code style ,test=develop

* add reshape opencl kernel && optimise conv 1x1 &&code style ,test=develop
上级 cec596d5
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1( __kernel void conv2d_1x1(__private const int global_size_dim0,
__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
...@@ -9,9 +8,10 @@ __kernel void conv2d_1x1( ...@@ -9,9 +8,10 @@ __kernel void conv2d_1x1(
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase, __read_only image2d_t new_scale,
#endif __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
...@@ -61,12 +61,13 @@ __kernel void conv2d_1x1( ...@@ -61,12 +61,13 @@ __kernel void conv2d_1x1(
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output0 = read_imagef(bias, sampler, (int2)(out_c, 0)); CL_DTYPE4 output0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0; CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0; CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0; CL_DTYPE4 output3 = output0;
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output0 = read_imagef(bias, sampler, output_pos0); CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
CL_DTYPE4 output1 = output0; CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0; CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0; CL_DTYPE4 output3 = output0;
...@@ -89,12 +90,17 @@ __kernel void conv2d_1x1( ...@@ -89,12 +90,17 @@ __kernel void conv2d_1x1(
// ------------0--------------- // ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y); in_pos_in_one_block0.y);
CL_DTYPE4 input0 = read_imagef(input_image, sampler, pos_in); CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
CL_DTYPE4 weight0 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight1 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 1)); CL_DTYPE4 weight0 =
CL_DTYPE4 weight2 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 2)); READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight3 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 3)); CL_DTYPE4 weight1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
CL_DTYPE4 weight2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
CL_DTYPE4 weight3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));
int bound_gap = max_w_bound - pos_in.x - 1; int bound_gap = max_w_bound - pos_in.x - 1;
bool outof_bound = bound_gap < input_width && bound_gap >= 0; bool outof_bound = bound_gap < input_width && bound_gap >= 0;
...@@ -109,7 +115,8 @@ __kernel void conv2d_1x1( ...@@ -109,7 +115,8 @@ __kernel void conv2d_1x1(
// -------------1-------------- // -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y); in_pos_in_one_block1.y);
CL_DTYPE4 input1 = read_imagef(input_image, sampler, pos_in); CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
bound_gap = max_w_bound - pos_in.x - 1; bound_gap = max_w_bound - pos_in.x - 1;
...@@ -126,7 +133,8 @@ __kernel void conv2d_1x1( ...@@ -126,7 +133,8 @@ __kernel void conv2d_1x1(
// -------------2-------------- // -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y); in_pos_in_one_block2.y);
CL_DTYPE4 input2 = read_imagef(input_image, sampler, pos_in); CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
bound_gap = max_w_bound - pos_in.x - 1; bound_gap = max_w_bound - pos_in.x - 1;
...@@ -143,7 +151,8 @@ __kernel void conv2d_1x1( ...@@ -143,7 +151,8 @@ __kernel void conv2d_1x1(
// -------------3-------------- // -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y); in_pos_in_one_block3.y);
CL_DTYPE4 input3 = read_imagef(input_image, sampler, pos_in); CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
bound_gap = max_w_bound - pos_in.x - 1; bound_gap = max_w_bound - pos_in.x - 1;
outof_bound = bound_gap < input_width && bound_gap >= 0; outof_bound = bound_gap < input_width && bound_gap >= 0;
...@@ -165,17 +174,21 @@ __kernel void conv2d_1x1( ...@@ -165,17 +174,21 @@ __kernel void conv2d_1x1(
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output0 = output0 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) + output0 = output0 * READ_IMG_TYPE(
read_imagef(new_biase, sampler, (int2)(out_c, 0)); CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) + output1 = output1 * READ_IMG_TYPE(
read_imagef(new_biase, sampler, (int2)(out_c, 0)); CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) + output2 = output2 * READ_IMG_TYPE(
read_imagef(new_biase, sampler, (int2)(out_c, 0)); CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) + output3 = output3 * READ_IMG_TYPE(
read_imagef(new_biase, sampler, (int2)(out_c, 0)); CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -186,18 +199,18 @@ __kernel void conv2d_1x1( ...@@ -186,18 +199,18 @@ __kernel void conv2d_1x1(
#endif #endif
if (out_w0 < old_w) { if (out_w0 < old_w) {
write_imagef(output_image, output_pos0, output0); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
} }
if (out_w1 < old_w) { if (out_w1 < old_w) {
write_imagef(output_image, output_pos1, output1); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
} }
if (out_w2 < old_w) { if (out_w2 < old_w) {
write_imagef(output_image, output_pos2, output2); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
} }
if (out_w3 < old_w) { if (out_w3 < old_w) {
write_imagef(output_image, output_pos3, output3); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
} }
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void reshape(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W,
__private const int in_H,
__private const int in_Stride0,
__private const int in_Stride1,
__private const int in_Stride2,
__private const int out_Stride0,
__private const int out_Stride1,
__private const int out_Stride2) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4 + 2;
const int out_c3 = out_c * 4 + 3;
int count0 =
out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
int count1 =
out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
int count2 =
out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
int count3 =
out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
int in_n0 = count0 / in_Stride2;
int in_n1 = count1 / in_Stride2;
int in_n2 = count1 / in_Stride2;
int in_n3 = count2 / in_Stride2;
count0 = count0 % in_Stride2;
count1 = count1 % in_Stride2;
count2 = count2 % in_Stride2;
count3 = count3 % in_Stride2;
int in_c0 = count0 / in_Stride1;
int in_c1 = count1 / in_Stride1;
int in_c2 = count2 / in_Stride1;
int in_c3 = count3 / in_Stride1;
int in_h0 = (count0 % in_Stride1) / in_Stride0;
int in_h1 = (count1 % in_Stride1) / in_Stride0;
int in_h2 = (count2 % in_Stride1) / in_Stride0;
int in_h3 = (count3 % in_Stride1) / in_Stride0;
int in_w0 = (count0 % in_Stride1) % in_Stride0;
int in_w1 = (count1 % in_Stride1) % in_Stride0;
int in_w2 = (count2 % in_Stride1) % in_Stride0;
int in_w3 = (count3 % in_Stride1) % in_Stride0;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = (in_c0 / 4) * in_W + in_w0;
input_pos0.y = in_n0 * in_H + in_h0;
input_pos1.x = (in_c1 / 4) * in_W + in_w1;
input_pos1.y = in_n1 * in_H + in_h1;
input_pos2.x = (in_c2 / 4) * in_W + in_w2;
input_pos2.y = in_n2 * in_H + in_h2;
input_pos3.x = (in_c3 / 4) * in_W + in_w3;
input_pos3.y = in_n3 * in_H + in_h3;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 input0;
CL_DTYPE4 input1;
CL_DTYPE4 input2;
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0);
if (in_c0 % 4 == 0) {
output.x = input0.x;
} else if (in_c0 % 4 == 1) {
output.x = input0.y;
} else if (in_c0 % 4 == 2) {
output.x = input0.z;
} else {
output.x = input0.w;
}
if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1);
if (in_c1 % 4 == 0) {
output.y = input1.x;
} else if (in_c1 % 4 == 1) {
output.y = input1.y;
} else if (in_c1 % 4 == 2) {
output.y = input1.z;
} else {
output.y = input1.w;
}
} else {
output.y = 0.0f;
}
if (out_C - out_c * 4 >= 3) {
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2);
if (in_c2 % 4 == 0) {
output.z = input2.x;
} else if (in_c2 % 4 == 1) {
output.z = input1.y;
} else if (in_c2 % 4 == 2) {
output.z = input2.z;
} else {
output.z = input2.w;
}
} else {
output.z = 0.0f;
}
if (out_C - out_c * 4 >= 4) {
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3);
if (in_c3 % 4 == 0) {
output.w = input3.x;
} else if (in_c3 % 4 == 1) {
output.w = input3.y;
} else if (in_c3 % 4 == 2) {
output.w = input3.z;
} else {
output.w = input3.w;
}
} else {
output.w = 0.0f;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
...@@ -15,6 +15,7 @@ add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${te ...@@ -15,6 +15,7 @@ add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${te
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
...@@ -51,7 +52,9 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc ...@@ -51,7 +52,9 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc
DEPS conv2d_1x1_opencl cl_image_converter op_registry program context DEPS conv2d_1x1_opencl cl_image_converter op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
DEPS reshape_opencl cl_image_converter op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// reshape operator
class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ReshapeParam;
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/reshape_kernel.cl", build_options_);
}
void Run() override {
VLOG(4) << "reshape_compute run ... ";
auto& param = *param_.get_mutable<param_t>();
const Tensor* const x = param.x;
const auto x_dims = x->dims();
const std::map<std::string, size_t>& input_image_shape =
InitImageDimInfoWith(x_dims);
const int64_t& input_image_width = input_image_shape.at("width");
const int64_t& input_image_height = input_image_shape.at("height");
const cl::Image2D* const x_image = x->data<float, cl::Image2D>();
const std::vector<int>& shape_vct = param.shape_vct;
Tensor* const output = param.output;
const DDimLite& out_dims = output->dims();
VLOG(4) << "out_dims= " << out_dims;
const std::map<std::string, size_t>& out_image_shape =
InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<float, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height"));
LOG(INFO) << "out_dims= " << out_dims;
const std::vector<size_t>& default_work_size = DefaultWorkSize(
out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape.at("width")),
static_cast<int64_t>(out_image_shape.at("height"))}));
int x_v_dims[4] = {1, 1, 1, 1};
int out_v_dims[4] = {1, 1, 1, 1};
// 1 1000 1 1
for (int i = 0; i < x_dims.size(); i++) {
x_v_dims[4 - x_dims.size() + i] = x_dims[i];
}
// 1 1 1 1000
for (int i = 0; i < out_dims.size(); i++) {
out_v_dims[4 - out_dims.size() + i] = out_dims[i];
}
int out_C = out_v_dims[1];
int out_H = out_v_dims[2];
int out_W = out_v_dims[3];
int in_W = x_v_dims[3];
int in_H = x_v_dims[2];
int in_Stride0 = in_W;
int in_Stride1 = x_v_dims[2] * x_v_dims[3];
int in_Stride2 = x_v_dims[1] * x_v_dims[2] * x_v_dims[3];
int out_Stride0 = out_W;
int out_Stride1 = out_H * out_W;
int out_Stride2 = out_C * out_H * out_W;
VLOG(4) << "out_C=" << out_C;
VLOG(4) << "out_H=" << out_H;
VLOG(4) << "out_W=" << out_W;
VLOG(4) << "in_W=" << in_W;
VLOG(4) << "default_work_size= " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
VLOG(4) << "in_Stride0=" << in_Stride0;
VLOG(4) << "in_Stride1=" << in_Stride1;
VLOG(4) << "out_Stride0=" << out_Stride0;
VLOG(4) << "out_Stride1=" << out_Stride1;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(x->target());
VLOG(4) << TargetToStr(param.output->target());
int arg_idx = 0;
cl_int status;
status = kernel.setArg(arg_idx, *x_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_C);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_H);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_W);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_W);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_H);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_Stride0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_Stride1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_Stride2);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_Stride0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_Stride1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_Stride2);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
private:
std::string kernel_func_name_{"reshape"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(reshape,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(reshape2,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/operators/reshape_op.h"
#include "lite/utils/logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
static DDim ValidateShape(const std::vector<int>& shape,
const DDim& input_dims) {
const lite::DDim::value_type input_size = input_dims.production();
auto input_shape = input_dims.Vectorize();
bool all_positive = std::all_of(
input_shape.cbegin(), input_shape.cend(), [](lite::DDim::value_type i) {
return i > 0;
});
// only one dimension can be set to -1, whose size will be automatically
// infered.
const int unk_dim_val = -1;
const int copy_dim_val = 0;
std::vector<lite::DDim::value_type> output_shape(shape.size(), 0);
lite::DDim::value_type capacity = 1;
int unk_dim_idx = -1;
for (size_t i = 0; i < shape.size(); ++i) {
if (shape[i] == unk_dim_val) {
CHECK_EQ(unk_dim_idx, -1)
<< "Only one input dimension of Attr(shape) can be unknown.";
unk_dim_idx = i;
} else if (shape[i] == copy_dim_val) {
CHECK_LT(static_cast<int>(i), input_shape.size())
<< "The index of dimension to copy from input shape must be less "
"than the size of input shape.";
} else {
CHECK_GT(shape[i], 0) << "Each input dimension of Attr(shape) must not "
"be negtive except one unknown dimension.";
}
capacity *= (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
: input_shape[i]);
output_shape[i] = (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
: input_shape[i]);
}
if (unk_dim_idx != -1) {
if (all_positive) {
// input_size < 0 and is un-determinate in compile time, skip the check,
// for example, input_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
// capacity = -24, input_size = -8, output_shape[0] = 0
// the following check will fail.
output_shape[unk_dim_idx] = -input_size / capacity;
CHECK_EQ(output_shape[unk_dim_idx] * capacity, -input_size)
<< "Invalid shape is given.";
} else {
output_shape[unk_dim_idx] = -1;
}
} else {
CHECK_EQ(capacity, input_size) << "Invalid shape is given.";
}
return lite::DDim(output_shape);
}
TEST(reshape_opencl, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"reshape", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "created reshape kernel";
LOG(INFO) << "prepare kernel ------";
int64_t batch_size = 1;
int64_t ic = 2;
int64_t ih = 4;
int64_t iw = 6;
lite::Tensor input, output;
operators::ReshapeParam param;
Tensor shape_tensor;
shape_tensor.Resize({2});
auto* shape_tensor_data = shape_tensor.mutable_data<int>();
shape_tensor_data[0] = 6;
shape_tensor_data[1] = 8;
param.x = &input;
param.shape_tensor = &shape_tensor; // use shape_tensor
param.inplace = false;
param.output = &output;
const DDim input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
input.Resize(input_dim);
std::vector<int> final_shape = std::vector<int>(
shape_tensor_data, shape_tensor_data + shape_tensor.numel());
auto output_dim = ValidateShape(final_shape, input_dim);
param.output->Resize(output_dim);
LOG(INFO) << " output_dim------" << output_dim;
LOG(INFO) << "prepare kernel SetParam------";
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
// LOG(INFO) << "map input ...";
// auto* mapped_input =
// static_cast<float*>(TargetWrapperCL::MapImage(input_data,
// input_image_width,
// input_image_height,
// cl_image2d_row_pitch,
// cl_image2d_slice_pitch));
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
LOG(INFO) << "gen input ...";
float* input_v_data = &input_v[0];
for (auto& i : input_v) {
i = gen(engine);
}
paddle::lite::CLImageConverterDefault default_convertor;
std::vector<float> x_image_data(input_image_width * input_image_height *
4); // 4 : RGBA
LOG(INFO) << "set mapped input ...";
default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
auto* input_image = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_data.data());
LOG(INFO) << "prepare kernel ready";
LOG(INFO) << "mutable output ...";
CLImageConverterDefault default_converter;
DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1];
auto* out_image = output.mutable_data<float, cl::Image2D>(out_image_shape[0],
out_image_shape[1]);
VLOG(4) << "out_dims= " << output_dim;
LOG(INFO) << "kernel context ...";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> reshape_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(reshape_context->As<OpenCLContext>()));
kernel->SetContext(std::move(reshape_context));
LOG(INFO) << "kernel launch ...";
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_image);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
float* out_image_data = new float[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data,
output.data<float, cl::Image2D>(),
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[out_image_shape.production() * 4];
default_converter.ImageToNCHW(
out_image_data, out_data, out_image_shape, output_dim);
// check output dims
for (int i = 0; i < output.dims().size(); i++) {
CHECK_EQ(output.dims()[i], shape_tensor_data[i]);
}
// check output data
for (int i = 0; i < output.numel(); i++) {
EXPECT_NEAR(out_data[i], input_v_data[i], 1e-3);
if (abs(out_data[i] - input_v_data[i]) > 1e-3) {
LOG(INFO) << "error idx:" << i;
}
}
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(reshape, kOpenCL, kFloat, kImageDefault, image2d);
USE_LITE_KERNEL(reshape2, kOpenCL, kFloat, kImageDefault, image2d);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册