未验证 提交 fbffa674 编写于 作者: X xiebaiyuan 提交者: GitHub

[opencl] expand opencl kernel & unit test (#3742)

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop expend opencl kernel & unit test ,test=develop

* [OPENCL] develop expend opencl kernel & unit test ,test=develop
上级 154021ad
#include <cl_common.h>
__kernel void expend_c1(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
in.y = 0;
in.z = 0;
in.w = 0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_c2(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
in.z = 0;
in.w = 0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_c3(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
in.w = 0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_c4(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = 0;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
__kernel void expend_cn(__private const int OUT_C,
__private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C,
__private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input,
__write_only image2d_t output,
__private const int n_times,
__private const int c_times,
__private const int h_times,
__private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
const int in_c = out_c;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
\ No newline at end of file
......@@ -35,6 +35,8 @@ add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_k
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -77,6 +79,9 @@ lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ExpandComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ExpandParam;
std::string doc() const override { return "expand using cl::Image2D, kFP16"; }
void PrepareForRun() override {
expand_param_ = param_.get_mutable<param_t>();
auto expand_times = expand_param_->expand_times;
auto in_dims = expand_param_->X->dims();
CHECK(in_dims.size() == 4) << "expand image now only support indims size 4";
CHECK(expand_times.size() == 4)
<< "expand image now only support in_expand_timesdims size 4";
CHECK(expand_times[1] == 1) << "expand image do not support expend c now";
// do not confuse with these cases.it is use to support expend c in future
if (in_dims[1] == 1) {
kernel_func_name_ = "expend_c1";
} else if (in_dims[1] == 2) {
kernel_func_name_ = "expend_c2";
} else if (in_dims[1] == 3) {
kernel_func_name_ = "expend_c3";
} else if (in_dims[1] == 4) {
kernel_func_name_ = "expend_c4";
} else {
kernel_func_name_ = "expend_cn";
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/expand_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
void ReInitWhenNeeded() override {
VLOG(1) << "ReInitWhenNeeded: " << kernel_func_name_;
auto x_dims = expand_param_->X->dims();
auto out_dims = expand_param_->Out->dims();
auto expand_times = expand_param_->expand_times;
VLOG(1) << "x_dims: " << x_dims;
VLOG(1) << "out_dims: " << out_dims;
VLOG(1) << "expand_times: " << expand_times[0] << " " << expand_times[1]
<< " " << expand_times[2] << " " << expand_times[3];
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ = default_convertor.InitImageDimInfoWith(out_dims);
VLOG(1) << "out_img_shape_: " << out_img_shape_[0] << " "
<< out_img_shape_[1];
// compute global work size
auto image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
size_t work_size_0 = image_width / out_dims[3];
size_t work_size_1 = out_dims[3];
size_t work_size_2 = out_dims[0] * out_dims[2];
global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
VLOG(1) << "global_work_size_: " << global_work_size_[0] << " "
<< global_work_size_[1] << " " << global_work_size_[2];
}
}
void Run() override {
auto* x_img = expand_param_->X->data<half_t, cl::Image2D>();
auto* out_img = expand_param_->Out->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
auto expand_times = expand_param_->expand_times;
auto x_dims = expand_param_->X->dims();
int in_n = x_dims[0];
int in_c = x_dims[1];
int in_h = x_dims[2];
int in_w = x_dims[3];
auto out_dims = expand_param_->Out->dims();
int out_n = out_dims[0];
int out_c = out_dims[1];
int out_h = out_dims[2];
int out_w = out_dims[3];
auto out_image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
int out_c_block = out_image_width / out_dims[3];
int out_nh = out_dims[0] * out_dims[2];
auto in_image_width = x_dims[3] * ((x_dims[1] + 3) / 4);
int in_c_block = in_image_width / x_dims[3];
int in_nh = x_dims[0] * x_dims[2];
int expand_times_n = expand_times[0];
int expand_times_c = expand_times[1];
int expand_times_h = expand_times[2];
int expand_times_w = expand_times[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto kernel = kernel_;
cl_int status;
status = kernel.setArg(0, out_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, out_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, out_nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, in_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(5, in_nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(6, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(7, in_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(8, out_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(9, out_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(11, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(12, expand_times_n);
CL_CHECK_FATAL(status);
status = kernel.setArg(13, expand_times_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(14, expand_times_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(15, expand_times_w);
CL_CHECK_FATAL(status);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private:
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
param_t* expand_param_{nullptr};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(expand,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ExpandComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册