未验证 提交 fbffa674 编写于 作者: X xiebaiyuan 提交者: GitHub

[opencl] expand opencl kernel & unit test (#3742)

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop

* [OPENCL] develop expand opencl kernel & unit test ,test=develop

* [OPENCL] develop expand opencl kernel & unit test ,test=develop
上级 154021ad
#include <cl_common.h>
/*
 * expend_c1: expand kernel used by the host when the input has one channel.
 *
 * Work-item ids: (0) output channel block, (1) output column,
 * (2) output row in the flattened N*H dimension. Each work-item reads one
 * input pixel, keeps only its x lane (y/z/w are forced to 0) and writes it
 * to one output pixel.
 *
 * OUT_C / OUT_W / OUT_NH bound the three work-item ids; input_height and
 * output_height are the per-batch heights used to split/merge the N*H row
 * index; n_times / h_times / w_times are the expand factors. IN_C, IN_W,
 * IN_NH, input_width, output_width and c_times are accepted for a uniform
 * argument list but are not read by this kernel.
 *
 * NOTE(review): the source index is computed as `out / times`, i.e. every
 * input element is repeated `times` consecutive times along an axis. Confirm
 * this is the intended expand semantics (as opposed to tiling, `out % in`).
 */
__kernel void expend_c1(__private const int OUT_C,
                        __private const int OUT_W,
                        __private const int OUT_NH,
                        __private const int IN_C,
                        __private const int IN_W,
                        __private const int IN_NH,
                        __private const int input_width,  /* of one block */
                        __private const int input_height, /* of one block */
                        __private const int output_width,
                        __private const int output_height,
                        __read_only image2d_t input,
                        __write_only image2d_t output,
                        __private const int n_times,
                        __private const int c_times,
                        __private const int h_times,
                        __private const int w_times) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Ignore work-items outside the requested output extent.
  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
    return;
  }

  // Split the flattened row index into batch and in-batch row.
  const int out_n = out_nh / output_height;
  const int out_h = out_nh % output_height;

  // Map the output position back to its source position in the input.
  const int in_w = out_w / w_times;
  const int in_h = out_h / h_times;
  const int in_n = out_n / n_times;
  const int in_nh = in_n * input_height + in_h;

  const int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
  // Only channel block 0 is read, so the input column is simply in_w.
  const int2 input_pos = (int2)(in_w, in_nh);

  // Integer pixel coordinates require an unnormalized-coordinate sampler;
  // reading with CLK_NORMALIZED_COORDS_TRUE and int2 coords is undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  // Single-channel variant: clear the three lanes that carry no channel.
  in.y = 0;
  in.z = 0;
  in.w = 0;
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
/*
 * expend_c2: expand kernel used by the host when the input has two channels.
 *
 * Work-item ids: (0) output channel block, (1) output column,
 * (2) output row in the flattened N*H dimension. Each work-item reads one
 * input pixel, keeps its x and y lanes (z/w are forced to 0) and writes it
 * to one output pixel.
 *
 * OUT_C / OUT_W / OUT_NH bound the three work-item ids; input_height and
 * output_height are the per-batch heights used to split/merge the N*H row
 * index; n_times / h_times / w_times are the expand factors. IN_C, IN_W,
 * IN_NH, input_width, output_width and c_times are accepted for a uniform
 * argument list but are not read by this kernel.
 *
 * NOTE(review): the source index is computed as `out / times`, i.e. every
 * input element is repeated `times` consecutive times along an axis. Confirm
 * this is the intended expand semantics (as opposed to tiling, `out % in`).
 */
__kernel void expend_c2(__private const int OUT_C,
                        __private const int OUT_W,
                        __private const int OUT_NH,
                        __private const int IN_C,
                        __private const int IN_W,
                        __private const int IN_NH,
                        __private const int input_width,  /* of one block */
                        __private const int input_height, /* of one block */
                        __private const int output_width,
                        __private const int output_height,
                        __read_only image2d_t input,
                        __write_only image2d_t output,
                        __private const int n_times,
                        __private const int c_times,
                        __private const int h_times,
                        __private const int w_times) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Ignore work-items outside the requested output extent.
  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
    return;
  }

  // Split the flattened row index into batch and in-batch row.
  const int out_n = out_nh / output_height;
  const int out_h = out_nh % output_height;

  // Map the output position back to its source position in the input.
  const int in_w = out_w / w_times;
  const int in_h = out_h / h_times;
  const int in_n = out_n / n_times;
  const int in_nh = in_n * input_height + in_h;

  const int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
  // Only channel block 0 is read, so the input column is simply in_w.
  const int2 input_pos = (int2)(in_w, in_nh);

  // Integer pixel coordinates require an unnormalized-coordinate sampler;
  // reading with CLK_NORMALIZED_COORDS_TRUE and int2 coords is undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  // Two-channel variant: clear the two lanes that carry no channel.
  in.z = 0;
  in.w = 0;
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
/*
 * expend_c3: expand kernel used by the host when the input has three
 * channels.
 *
 * Work-item ids: (0) output channel block, (1) output column,
 * (2) output row in the flattened N*H dimension. Each work-item reads one
 * input pixel, keeps its x, y and z lanes (w is forced to 0) and writes it
 * to one output pixel.
 *
 * OUT_C / OUT_W / OUT_NH bound the three work-item ids; input_height and
 * output_height are the per-batch heights used to split/merge the N*H row
 * index; n_times / h_times / w_times are the expand factors. IN_C, IN_W,
 * IN_NH, input_width, output_width and c_times are accepted for a uniform
 * argument list but are not read by this kernel.
 *
 * NOTE(review): the source index is computed as `out / times`, i.e. every
 * input element is repeated `times` consecutive times along an axis. Confirm
 * this is the intended expand semantics (as opposed to tiling, `out % in`).
 */
__kernel void expend_c3(__private const int OUT_C,
                        __private const int OUT_W,
                        __private const int OUT_NH,
                        __private const int IN_C,
                        __private const int IN_W,
                        __private const int IN_NH,
                        __private const int input_width,  /* of one block */
                        __private const int input_height, /* of one block */
                        __private const int output_width,
                        __private const int output_height,
                        __read_only image2d_t input,
                        __write_only image2d_t output,
                        __private const int n_times,
                        __private const int c_times,
                        __private const int h_times,
                        __private const int w_times) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Ignore work-items outside the requested output extent.
  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
    return;
  }

  // Split the flattened row index into batch and in-batch row.
  const int out_n = out_nh / output_height;
  const int out_h = out_nh % output_height;

  // Map the output position back to its source position in the input.
  const int in_w = out_w / w_times;
  const int in_h = out_h / h_times;
  const int in_n = out_n / n_times;
  const int in_nh = in_n * input_height + in_h;

  const int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
  // Only channel block 0 is read, so the input column is simply in_w.
  const int2 input_pos = (int2)(in_w, in_nh);

  // Integer pixel coordinates require an unnormalized-coordinate sampler;
  // reading with CLK_NORMALIZED_COORDS_TRUE and int2 coords is undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  // Three-channel variant: clear the one lane that carries no channel.
  in.w = 0;
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
/*
 * expend_c4: expand kernel used by the host when the input has four
 * channels.
 *
 * Work-item ids: (0) output channel block, (1) output column,
 * (2) output row in the flattened N*H dimension. Each work-item copies one
 * full 4-lane input pixel to one output pixel.
 *
 * OUT_C / OUT_W / OUT_NH bound the three work-item ids; input_height and
 * output_height are the per-batch heights used to split/merge the N*H row
 * index; n_times / h_times / w_times are the expand factors. IN_C, IN_W,
 * IN_NH, input_width, output_width and c_times are accepted for a uniform
 * argument list but are not read by this kernel.
 *
 * NOTE(review): the source index is computed as `out / times`, i.e. every
 * input element is repeated `times` consecutive times along an axis. Confirm
 * this is the intended expand semantics (as opposed to tiling, `out % in`).
 */
__kernel void expend_c4(__private const int OUT_C,
                        __private const int OUT_W,
                        __private const int OUT_NH,
                        __private const int IN_C,
                        __private const int IN_W,
                        __private const int IN_NH,
                        __private const int input_width,  /* of one block */
                        __private const int input_height, /* of one block */
                        __private const int output_width,
                        __private const int output_height,
                        __read_only image2d_t input,
                        __write_only image2d_t output,
                        __private const int n_times,
                        __private const int c_times,
                        __private const int h_times,
                        __private const int w_times) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Ignore work-items outside the requested output extent.
  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
    return;
  }

  // Split the flattened row index into batch and in-batch row.
  const int out_n = out_nh / output_height;
  const int out_h = out_nh % output_height;

  // Map the output position back to its source position in the input.
  const int in_w = out_w / w_times;
  const int in_h = out_h / h_times;
  const int in_n = out_n / n_times;
  const int in_nh = in_n * input_height + in_h;

  const int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
  // Only channel block 0 is read, so the input column is simply in_w.
  const int2 input_pos = (int2)(in_w, in_nh);

  // Integer pixel coordinates require an unnormalized-coordinate sampler;
  // reading with CLK_NORMALIZED_COORDS_TRUE and int2 coords is undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // All four lanes are valid, so the pixel is forwarded unchanged.
  const CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
/*
 * expend_cn: expand kernel used by the host for every channel count that
 * has no dedicated expend_c1..c4 variant.
 *
 * Work-item ids: (0) output channel block, (1) output column,
 * (2) output row in the flattened N*H dimension. Each work-item copies one
 * full 4-lane pixel from the same channel block of the input to the output.
 *
 * OUT_C / OUT_W / OUT_NH bound the three work-item ids; IN_W is the input
 * width used to step between channel blocks in the input image;
 * input_height and output_height are the per-batch heights used to
 * split/merge the N*H row index; n_times / h_times / w_times are the expand
 * factors. IN_C, IN_NH, input_width, output_width and c_times are accepted
 * for a uniform argument list but are not read by this kernel.
 *
 * NOTE(review): the source index is computed as `out / times`, i.e. every
 * input element is repeated `times` consecutive times along an axis. Confirm
 * this is the intended expand semantics (as opposed to tiling, `out % in`).
 */
__kernel void expend_cn(__private const int OUT_C,
                        __private const int OUT_W,
                        __private const int OUT_NH,
                        __private const int IN_C,
                        __private const int IN_W,
                        __private const int IN_NH,
                        __private const int input_width,  /* of one block */
                        __private const int input_height, /* of one block */
                        __private const int output_width,
                        __private const int output_height,
                        __read_only image2d_t input,
                        __write_only image2d_t output,
                        __private const int n_times,
                        __private const int c_times,
                        __private const int h_times,
                        __private const int w_times) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Ignore work-items outside the requested output extent.
  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
    return;
  }

  // Split the flattened row index into batch and in-batch row.
  const int out_n = out_nh / output_height;
  const int out_h = out_nh % output_height;

  // Channel blocks map one-to-one (c_times is not applied here).
  const int in_c = out_c;
  const int in_w = out_w / w_times;
  const int in_h = out_h / h_times;
  const int in_n = out_n / n_times;
  const int in_nh = in_n * input_height + in_h;

  const int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
  const int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);

  // Integer pixel coordinates require an unnormalized-coordinate sampler;
  // reading with CLK_NORMALIZED_COORDS_TRUE and int2 coords is undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
}
\ No newline at end of file
......@@ -35,6 +35,8 @@ add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_k
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -77,6 +79,9 @@ lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// OpenCL kernel wrapper for the `expand` op on FP16 cl::Image2D tensors.
//
// Lifecycle as implemented below:
//   - PrepareForRun():    validates the parameters, picks one of the
//                         expend_c1/c2/c3/c4/cn device kernels from the input
//                         channel count and builds it.
//   - ReInitWhenNeeded(): recomputes the output image shape and the global
//                         work size on the first call and whenever the input
//                         dims change.
//   - Run():              binds the 16 kernel arguments and enqueues the
//                         kernel.
//
// Only 4-D inputs with 4 expand factors are accepted, and the channel expand
// factor must be 1.
class ExpandComputeImage2D : public KernelLite<TARGET(kOpenCL),
                                               PRECISION(kFP16),
                                               DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ExpandParam;

  std::string doc() const override { return "expand using cl::Image2D, kFP16"; }

  // Validates the op parameters and builds the matching OpenCL kernel.
  void PrepareForRun() override {
    expand_param_ = param_.get_mutable<param_t>();
    const auto& expand_times = expand_param_->expand_times;
    const auto& in_dims = expand_param_->X->dims();

    CHECK(in_dims.size() == 4) << "expand image now only support indims size 4";
    CHECK(expand_times.size() == 4)
        << "expand image now only support in_expand_timesdims size 4";
    CHECK(expand_times[1] == 1) << "expand image do not support expend c now";

    // The per-channel-count kernels look redundant while the channel expand
    // factor is pinned to 1; they are kept so that channel expansion can be
    // added later.
    switch (in_dims[1]) {
      case 1:
        kernel_func_name_ = "expend_c1";
        break;
      case 2:
        kernel_func_name_ = "expend_c2";
        break;
      case 3:
        kernel_func_name_ = "expend_c3";
        break;
      case 4:
        kernel_func_name_ = "expend_c4";
        break;
      default:
        kernel_func_name_ = "expend_cn";
        break;
    }
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;

    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(kernel_func_name_,
                                    "image/expand_kernel.cl",
                                    build_options_,
                                    time_stamp_);

    // The built kernel is looked up under name + build options + time stamp.
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
  }

  // Refreshes the cached output image shape and global work size. The work
  // is done on the first call and again whenever the input dims differ from
  // the ones seen on the previous refresh.
  void ReInitWhenNeeded() override {
    VLOG(1) << "ReInitWhenNeeded: " << kernel_func_name_;

    const auto& x_dims = expand_param_->X->dims();
    const auto& out_dims = expand_param_->Out->dims();
    const auto& expand_times = expand_param_->expand_times;

    VLOG(1) << "x_dims: " << x_dims;
    VLOG(1) << "out_dims: " << out_dims;
    VLOG(1) << "expand_times: " << expand_times[0] << " " << expand_times[1]
            << " " << expand_times[2] << " " << expand_times[3];

    if (first_epoch_for_reinit_ || x_dims != last_x_dims_) {
      last_x_dims_ = x_dims;
      first_epoch_for_reinit_ = false;

      // compute image shape
      paddle::lite::CLImageConverterDefault default_convertor;
      out_img_shape_ = default_convertor.InitImageDimInfoWith(out_dims);
      VLOG(1) << "out_img_shape_: " << out_img_shape_[0] << " "
              << out_img_shape_[1];

      // compute global work size: (channel blocks of 4, W, N * H)
      const size_t work_size_0 = static_cast<size_t>((out_dims[1] + 3) / 4);
      const size_t work_size_1 = static_cast<size_t>(out_dims[3]);
      const size_t work_size_2 =
          static_cast<size_t>(out_dims[0] * out_dims[2]);
      global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
      VLOG(1) << "global_work_size_: " << global_work_size_[0] << " "
              << global_work_size_[1] << " " << global_work_size_[2];
    }
  }

  // Binds the kernel arguments (in the order the device kernels declare
  // them) and enqueues the kernel over the cached global work size.
  void Run() override {
    auto* x_img = expand_param_->X->data<half_t, cl::Image2D>();
    auto* out_img = expand_param_->Out->mutable_data<half_t, cl::Image2D>(
        out_img_shape_[0], out_img_shape_[1]);

    const auto& expand_times = expand_param_->expand_times;
    const auto& x_dims = expand_param_->X->dims();
    const auto& out_dims = expand_param_->Out->dims();

    const int in_h = static_cast<int>(x_dims[2]);
    const int in_w = static_cast<int>(x_dims[3]);
    const int out_h = static_cast<int>(out_dims[2]);
    const int out_w = static_cast<int>(out_dims[3]);

    // Channels are packed four per pixel, so the block count is ceil(C / 4).
    const int in_c_block = static_cast<int>((x_dims[1] + 3) / 4);
    const int out_c_block = static_cast<int>((out_dims[1] + 3) / 4);
    // Batch and height are flattened into the image's row dimension.
    const int in_nh = static_cast<int>(x_dims[0] * x_dims[2]);
    const int out_nh = static_cast<int>(out_dims[0] * out_dims[2]);

    const int expand_times_n = expand_times[0];
    const int expand_times_c = expand_times[1];
    const int expand_times_h = expand_times[2];
    const int expand_times_w = expand_times[3];

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto kernel = kernel_;
    cl_int status;
    status = kernel.setArg(0, out_c_block);  // OUT_C
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, out_w);  // OUT_W
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, out_nh);  // OUT_NH
    CL_CHECK_FATAL(status);
    status = kernel.setArg(3, in_c_block);  // IN_C
    CL_CHECK_FATAL(status);
    status = kernel.setArg(4, in_w);  // IN_W
    CL_CHECK_FATAL(status);
    status = kernel.setArg(5, in_nh);  // IN_NH
    CL_CHECK_FATAL(status);
    status = kernel.setArg(6, in_w);  // input_width
    CL_CHECK_FATAL(status);
    status = kernel.setArg(7, in_h);  // input_height
    CL_CHECK_FATAL(status);
    status = kernel.setArg(8, out_w);  // output_width
    CL_CHECK_FATAL(status);
    status = kernel.setArg(9, out_h);  // output_height
    CL_CHECK_FATAL(status);
    status = kernel.setArg(10, *x_img);  // input
    CL_CHECK_FATAL(status);
    status = kernel.setArg(11, *out_img);  // output
    CL_CHECK_FATAL(status);
    status = kernel.setArg(12, expand_times_n);  // n_times
    CL_CHECK_FATAL(status);
    status = kernel.setArg(13, expand_times_c);  // c_times
    CL_CHECK_FATAL(status);
    status = kernel.setArg(14, expand_times_h);  // h_times
    CL_CHECK_FATAL(status);
    status = kernel.setArg(15, expand_times_w);  // w_times
    CL_CHECK_FATAL(status);

    status = EnqueueNDRangeKernel(context,
                                  kernel,
                                  cl::NullRange,
                                  global_work_size_,
                                  cl::NullRange,
                                  nullptr,
                                  event_);
    CL_CHECK_FATAL(status);
  }

#ifdef LITE_WITH_PROFILE
  // Reports the selected kernel name and the enqueue event to the profiler.
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
    ch->kernel_func_name = kernel_func_name_;
    ch->cl_event =
        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
  }
#endif

 private:
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};

  param_t* expand_param_{nullptr};
  cl::Kernel kernel_;

  // State for ReInitWhenNeeded(): true until the first refresh, then the
  // input dims seen at the last refresh.
  bool first_epoch_for_reinit_{true};
  DDim last_x_dims_;

  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
  cl::NDRange global_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Registers ExpandComputeImage2D as the `expand` kernel for the OpenCL
// target with FP16 precision and the default image layout, under the alias
// `image2d`. Both the input "X" and the output "Out" are bound to tensors of
// that same target / precision / layout.
REGISTER_LITE_KERNEL(expand,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ExpandComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <random>
#include <vector>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
// Runs the OpenCL `expand` kernel on a 1x1x2x3 input (values 0..5) with
// expand factors {1, 1, 2, 3} and compares the result, converted back to
// NCHW, against a hard-coded reference within FP16_MAX_DIFF.
TEST(expand_hw_image2d, compute) {
  LOG(INFO) << "create kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());

  const int INPUT_N = 1;
  const int INPUT_C = 1;
  const int INPUT_H = 2;
  const int INPUT_W = 3;
  const int EXPAND_N = 1;
  const int EXPAND_C = 1;
  const int EXPAND_H = 2;
  const int EXPAND_W = 3;

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();

  lite::Tensor x, out;
  operators::ExpandParam param;
  param.X = &x;
  param.Out = &out;
  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> expand_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(expand_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(expand_context));

  const DDim in_dim =
      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
                                                          INPUT_C * EXPAND_C,
                                                          INPUT_H * EXPAND_H,
                                                          INPUT_W * EXPAND_W});
  LOG(INFO) << "in_dim: " << in_dim;
  LOG(INFO) << "expand_times: " << EXPAND_N << EXPAND_C << EXPAND_H << EXPAND_W;
  LOG(INFO) << "out_dim: " << out_dim;

  x.Resize(in_dim);
  out.Resize(out_dim);

  // Deterministic input: element i holds the value i.
  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
  int index = 0;
  for (auto& value : input_v) {
    value = index++;
  }
  VLOG(1) << "input_v ..... ";
  for (size_t i = 0; i < input_v.size(); i++) {
    VLOG(10) << input_v[i];
  }

  LOG(INFO) << "prepare input";
  CLImageConverterDefault default_converter;
  DDim x_image_shape = default_converter.InitImageDimInfoWith(in_dim);
  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
            << x_image_shape[1];
  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
  default_converter.NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
  // Allocates the input image and initialises it from the host buffer.
  x.mutable_data<half_t, cl::Image2D>(
      x_image_shape[0], x_image_shape[1], x_image_data.data());
  VLOG(1) << "x_image_data ..... ";
  for (size_t i = 0; i < x_image_data.size(); i++) {
    VLOG(10) << Half2Float(x_image_data[i]);
  }

  DDim out_image_shape = default_converter.InitImageDimInfoWith(out_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                                          out_image_shape[1]);

  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Expected output in NCHW order.
  std::vector<float> out_data_v{0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0,
                                1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
                                5, 5, 5, 3, 3, 3, 4, 4, 4, 5, 5, 5};

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};
  // Host buffers are owned by vectors so they are released on every exit
  // path, including fatal gtest assertions.
  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              out_image,
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  VLOG(1) << "out_image_data ..... ";
  for (size_t i = 0; i < out_image_data.size(); i++) {
    VLOG(10) << Half2Float(out_image_data[i]);
  }

  std::vector<float> out_data(out_image_shape.production() * 4);
  default_converter.ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
  VLOG(1) << "out_data ..... ";
  for (int i = 0; i < out_dim.production(); i++) {
    VLOG(10) << out_data[i];
  }

  // Guard the element-wise comparison against a reference of the wrong size.
  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
  for (size_t i = 0; i < out_data_v.size(); i++) {
    // std::fabs: an unqualified abs() may bind to the integer overload and
    // truncate the difference.
    const float abs_diff = std::fabs(out_data[i] - out_data_v[i]);
    const auto relative_diff =
        COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
    EXPECT_TRUE((relative_diff <= FP16_MAX_DIFF) ||
                (abs_diff <= FP16_MAX_DIFF));
    if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
      LOG(ERROR) << "error idx:" << i << " out_data[" << i
                 << "]:" << out_data[i] << " "
                                           "out_ref["
                 << i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
                 << " relative_diff:" << relative_diff
                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
    }
  }
}
// Runs the OpenCL `expand` kernel on a 1x2x2x3 input (values 0..11) with
// expand factors {1, 1, 2, 1} and compares the result, converted back to
// NCHW, against a hard-coded reference within FP16_MAX_DIFF.
TEST(expand_c2hw_image2d, compute) {
  LOG(INFO) << "create kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());

  const int INPUT_N = 1;
  const int INPUT_C = 2;
  const int INPUT_H = 2;
  const int INPUT_W = 3;
  const int EXPAND_N = 1;
  const int EXPAND_C = 1;
  const int EXPAND_H = 2;
  const int EXPAND_W = 1;

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();

  lite::Tensor x, out;
  operators::ExpandParam param;
  param.X = &x;
  param.Out = &out;
  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> expand_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(expand_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(expand_context));

  const DDim in_dim =
      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
                                                          INPUT_C * EXPAND_C,
                                                          INPUT_H * EXPAND_H,
                                                          INPUT_W * EXPAND_W});
  LOG(INFO) << "in_dim: " << in_dim;
  LOG(INFO) << "expand_times: " << EXPAND_N << EXPAND_C << EXPAND_H << EXPAND_W;
  LOG(INFO) << "out_dim: " << out_dim;

  x.Resize(in_dim);
  out.Resize(out_dim);

  // Deterministic input: element i holds the value i.
  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
  int index = 0;
  for (auto& value : input_v) {
    value = index++;
  }
  VLOG(1) << "input_v ..... ";
  for (size_t i = 0; i < input_v.size(); i++) {
    VLOG(10) << input_v[i];
  }

  LOG(INFO) << "prepare input";
  CLImageConverterDefault default_converter;
  DDim x_image_shape = default_converter.InitImageDimInfoWith(in_dim);
  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
            << x_image_shape[1];
  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
  default_converter.NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
  // Allocates the input image and initialises it from the host buffer.
  x.mutable_data<half_t, cl::Image2D>(
      x_image_shape[0], x_image_shape[1], x_image_data.data());
  VLOG(1) << "x_image_data ..... ";
  for (size_t i = 0; i < x_image_data.size(); i++) {
    VLOG(10) << Half2Float(x_image_data[i]);
  }

  DDim out_image_shape = default_converter.InitImageDimInfoWith(out_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                                          out_image_shape[1]);

  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Expected output in NCHW order.
  std::vector<float> out_data_v{0, 1, 2, 0, 1, 2, 3, 4,  5,  3, 4,  5,
                                6, 7, 8, 6, 7, 8, 9, 10, 11, 9, 10, 11};

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};
  // Host buffers are owned by vectors so they are released on every exit
  // path, including fatal gtest assertions.
  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              out_image,
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  VLOG(1) << "out_image_data ..... ";
  for (size_t i = 0; i < out_image_data.size(); i++) {
    VLOG(10) << Half2Float(out_image_data[i]);
  }

  std::vector<float> out_data(out_image_shape.production() * 4);
  default_converter.ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
  VLOG(1) << "out_data ..... ";
  for (int i = 0; i < out_dim.production(); i++) {
    VLOG(10) << out_data[i];
  }

  // Guard the element-wise comparison against a reference of the wrong size.
  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
  for (size_t i = 0; i < out_data_v.size(); i++) {
    // std::fabs: an unqualified abs() may bind to the integer overload and
    // truncate the difference.
    const float abs_diff = std::fabs(out_data[i] - out_data_v[i]);
    const auto relative_diff =
        COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
    EXPECT_TRUE((relative_diff <= FP16_MAX_DIFF) ||
                (abs_diff <= FP16_MAX_DIFF));
    if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
      LOG(ERROR) << "error idx:" << i << " out_data[" << i
                 << "]:" << out_data[i] << " "
                                           "out_ref["
                 << i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
                 << " relative_diff:" << relative_diff
                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
    }
  }
}
// Runs the OpenCL `expand` kernel on a 1x3x2x3 input (values 0..17) with
// expand factors {1, 1, 2, 1} and compares the result, converted back to
// NCHW, against a hard-coded reference within FP16_MAX_DIFF.
TEST(expand_c3hw_image2d, compute) {
  LOG(INFO) << "create kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());

  const int INPUT_N = 1;
  const int INPUT_C = 3;
  const int INPUT_H = 2;
  const int INPUT_W = 3;
  const int EXPAND_N = 1;
  const int EXPAND_C = 1;
  const int EXPAND_H = 2;
  const int EXPAND_W = 1;

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();

  lite::Tensor x, out;
  operators::ExpandParam param;
  param.X = &x;
  param.Out = &out;
  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> expand_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(expand_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(expand_context));

  const DDim in_dim =
      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
                                                          INPUT_C * EXPAND_C,
                                                          INPUT_H * EXPAND_H,
                                                          INPUT_W * EXPAND_W});
  LOG(INFO) << "in_dim: " << in_dim;
  LOG(INFO) << "expand_times: " << EXPAND_N << EXPAND_C << EXPAND_H << EXPAND_W;
  LOG(INFO) << "out_dim: " << out_dim;

  x.Resize(in_dim);
  out.Resize(out_dim);

  // Deterministic input: element i holds the value i.
  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
  int index = 0;
  for (auto& value : input_v) {
    value = index++;
  }
  VLOG(1) << "input_v ..... ";
  for (size_t i = 0; i < input_v.size(); i++) {
    VLOG(10) << input_v[i];
  }

  LOG(INFO) << "prepare input";
  CLImageConverterDefault default_converter;
  DDim x_image_shape = default_converter.InitImageDimInfoWith(in_dim);
  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
            << x_image_shape[1];
  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
  default_converter.NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
  // Allocates the input image and initialises it from the host buffer.
  x.mutable_data<half_t, cl::Image2D>(
      x_image_shape[0], x_image_shape[1], x_image_data.data());
  VLOG(1) << "x_image_data ..... ";
  for (size_t i = 0; i < x_image_data.size(); i++) {
    VLOG(10) << Half2Float(x_image_data[i]);
  }

  DDim out_image_shape = default_converter.InitImageDimInfoWith(out_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                                          out_image_shape[1]);

  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Expected output in NCHW order.
  std::vector<float> out_data_v{0,  1,  2,  0,  1,  2,  3,  4,  5,  3,  4,  5,
                                6,  7,  8,  6,  7,  8,  9,  10, 11, 9,  10, 11,
                                12, 13, 14, 12, 13, 14, 15, 16, 17, 15, 16, 17};

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};
  // Host buffers are owned by vectors so they are released on every exit
  // path, including fatal gtest assertions.
  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              out_image,
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  VLOG(1) << "out_image_data ..... ";
  for (size_t i = 0; i < out_image_data.size(); i++) {
    VLOG(10) << Half2Float(out_image_data[i]);
  }

  std::vector<float> out_data(out_image_shape.production() * 4);
  default_converter.ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
  VLOG(1) << "out_data ..... ";
  for (int i = 0; i < out_dim.production(); i++) {
    VLOG(10) << out_data[i];
  }

  // Guard the element-wise comparison against a reference of the wrong size.
  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
  for (size_t i = 0; i < out_data_v.size(); i++) {
    // std::fabs: an unqualified abs() may bind to the integer overload and
    // truncate the difference.
    const float abs_diff = std::fabs(out_data[i] - out_data_v[i]);
    const auto relative_diff =
        COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
    EXPECT_TRUE((relative_diff <= FP16_MAX_DIFF) ||
                (abs_diff <= FP16_MAX_DIFF));
    if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
      LOG(ERROR) << "error idx:" << i << " out_data[" << i
                 << "]:" << out_data[i] << " "
                                           "out_ref["
                 << i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
                 << " relative_diff:" << relative_diff
                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
    }
  }
}
// Runs the OpenCL `expand` kernel on a 1x4x2x1 input (values 0..7) with
// expand factors {1, 1, 2, 1} and compares the result, converted back to
// NCHW, against a hard-coded reference within FP16_MAX_DIFF.
TEST(expand_c4hw_image2d, compute) {
  LOG(INFO) << "create kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());

  const int INPUT_N = 1;
  const int INPUT_C = 4;
  const int INPUT_H = 2;
  const int INPUT_W = 1;
  const int EXPAND_N = 1;
  const int EXPAND_C = 1;
  const int EXPAND_H = 2;
  const int EXPAND_W = 1;

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();

  lite::Tensor x, out;
  operators::ExpandParam param;
  param.X = &x;
  param.Out = &out;
  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> expand_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(expand_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(expand_context));

  const DDim in_dim =
      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
                                                          INPUT_C * EXPAND_C,
                                                          INPUT_H * EXPAND_H,
                                                          INPUT_W * EXPAND_W});
  LOG(INFO) << "in_dim: " << in_dim;
  LOG(INFO) << "expand_times: " << EXPAND_N << EXPAND_C << EXPAND_H << EXPAND_W;
  LOG(INFO) << "out_dim: " << out_dim;

  x.Resize(in_dim);
  out.Resize(out_dim);

  // Deterministic input: element i holds the value i.
  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
  int index = 0;
  for (auto& value : input_v) {
    value = index++;
  }
  VLOG(1) << "input_v ..... ";
  for (size_t i = 0; i < input_v.size(); i++) {
    VLOG(10) << input_v[i];
  }

  LOG(INFO) << "prepare input";
  CLImageConverterDefault default_converter;
  DDim x_image_shape = default_converter.InitImageDimInfoWith(in_dim);
  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
            << x_image_shape[1];
  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
  default_converter.NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
  // Allocates the input image and initialises it from the host buffer.
  x.mutable_data<half_t, cl::Image2D>(
      x_image_shape[0], x_image_shape[1], x_image_data.data());
  VLOG(1) << "x_image_data ..... ";
  for (size_t i = 0; i < x_image_data.size(); i++) {
    VLOG(10) << Half2Float(x_image_data[i]);
  }

  DDim out_image_shape = default_converter.InitImageDimInfoWith(out_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                                          out_image_shape[1]);

  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Expected output in NCHW order.
  std::vector<float> out_data_v{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};
  // Host buffers are owned by vectors so they are released on every exit
  // path, including fatal gtest assertions.
  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              out_image,
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  VLOG(1) << "out_image_data ..... ";
  for (size_t i = 0; i < out_image_data.size(); i++) {
    VLOG(10) << Half2Float(out_image_data[i]);
  }

  std::vector<float> out_data(out_image_shape.production() * 4);
  default_converter.ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
  VLOG(1) << "out_data ..... ";
  for (int i = 0; i < out_dim.production(); i++) {
    VLOG(10) << out_data[i];
  }

  // Guard the element-wise comparison against a reference of the wrong size.
  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
  for (size_t i = 0; i < out_data_v.size(); i++) {
    // std::fabs: an unqualified abs() may bind to the integer overload and
    // truncate the difference.
    const float abs_diff = std::fabs(out_data[i] - out_data_v[i]);
    const auto relative_diff =
        COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
    EXPECT_TRUE((relative_diff <= FP16_MAX_DIFF) ||
                (abs_diff <= FP16_MAX_DIFF));
    if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
      LOG(ERROR) << "error idx:" << i << " out_data[" << i
                 << "]:" << out_data[i] << " "
                                           "out_ref["
                 << i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
                 << " relative_diff:" << relative_diff
                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
    }
  }
}
// Runs the OpenCL image2d "expand" kernel on a 1x1x2x3 input holding the
// values 0..5 with expand_times = {2, 1, 2, 3}, reads the 2x1x4x9 result back
// to the host and compares it element-wise against a hard-coded reference
// within FP16_MAX_DIFF (absolute or relative).
TEST(expand_n_image2d, compute) {
  LOG(INFO) << "create kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());

  const int INPUT_N = 1;
  const int INPUT_C = 1;
  const int INPUT_H = 2;
  const int INPUT_W = 3;
  const int EXPAND_N = 2;
  const int EXPAND_C = 1;
  const int EXPAND_H = 2;
  const int EXPAND_W = 3;

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();

  lite::Tensor x, out;
  operators::ExpandParam param;
  param.X = &x;
  param.Out = &out;
  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};

  // The kernel gets its own context that shares state with an initialised
  // OpenCL context.
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> expand_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(expand_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(expand_context));

  const DDim in_dim =
      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
                                                          INPUT_C * EXPAND_C,
                                                          INPUT_H * EXPAND_H,
                                                          INPUT_W * EXPAND_W});
  LOG(INFO) << "in_dim: " << in_dim;
  // Separate the factors so the log does not read as one number ("2123").
  LOG(INFO) << "expand_times: " << EXPAND_N << " " << EXPAND_C << " "
            << EXPAND_H << " " << EXPAND_W;
  LOG(INFO) << "out_dim: " << out_dim;
  x.Resize(in_dim);
  out.Resize(out_dim);

  // Deterministic input: element k holds the value k.
  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
  int index = 0;
  for (auto& value : input_v) {
    value = index++;
  }
  VLOG(1) << "input_v ..... ";
  for (size_t i = 0; i < input_v.size(); i++) {
    VLOG(10) << input_v[i];
  }

  LOG(INFO) << "prepare input";
  // Owned by unique_ptr so the converter is released on every exit path,
  // including early returns triggered by ASSERT_* failures.
  std::unique_ptr<CLImageConverterDefault> default_converter(
      new CLImageConverterDefault());
  DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
            << x_image_shape[1];
  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
  default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
  // The returned image pointer is not needed here; the call is made to
  // allocate the input image from the converted host data.
  x.mutable_data<half_t, cl::Image2D>(
      x_image_shape[0], x_image_shape[1], x_image_data.data());
  VLOG(1) << "x_image_data ..... ";
  for (size_t i = 0; i < x_image_data.size(); i++) {
    VLOG(10) << Half2Float(x_image_data[i]);
  }

  DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                                          out_image_shape[1]);

  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Reference output in NCHW order: every input element is repeated 3 times
  // along W, every row 2 times along H, and the whole batch 2 times along N.
  const std::vector<float> out_data_v{
      0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
      5, 5, 5, 3, 3, 3, 4, 4, 4, 5, 5, 5, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0,
      1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 3, 3, 3, 4, 4, 4, 5, 5, 5};
  const size_t out_count = static_cast<size_t>(out_dim.production());
  // Guard the comparison loop below against reading past the reference data.
  ASSERT_EQ(out_data_v.size(), out_count);

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};
  const size_t out_image_elems =
      static_cast<size_t>(out_image_shape.production()) * 4;  // 4 : RGBA
  // Host-side buffers live in vectors so they are freed automatically.
  std::vector<half_t> out_image_data(out_image_elems);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              out_image,
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  VLOG(1) << "out_image_data ..... ";
  for (size_t i = 0; i < out_image_elems; i++) {
    VLOG(10) << Half2Float(out_image_data[i]);
  }

  std::vector<float> out_data(out_image_elems);
  default_converter->ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
  VLOG(1) << "out_data ..... ";
  for (size_t i = 0; i < out_count; i++) {
    VLOG(10) << out_data[i];
  }

  for (size_t i = 0; i < out_count; i++) {
    // Computed by hand in float: an unqualified abs() may bind to the integer
    // overload on some toolchains and truncate small errors to zero.
    const float diff = out_data[i] - out_data_v[i];
    const float abs_diff = diff < 0 ? -diff : diff;
    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
    EXPECT_TRUE((relative_diff <= FP16_MAX_DIFF) ||
                (abs_diff <= FP16_MAX_DIFF));
    if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
      LOG(ERROR) << "error idx:" << i << " out_data[" << i
                 << "]:" << out_data[i] << " "
                                           "out_ref["
                 << i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
                 << " relative_diff:" << relative_diff
                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
    }
  }
}
} // namespace lite
} // namespace paddle
// NOTE(review): presumably pulls the kernel registered under
// (expand, kOpenCL, kFP16, kImageDefault, image2d) into this test binary so
// that KernelRegistry::Global().Create("expand", ...) can find it — confirm
// against the USE_LITE_KERNEL macro definition.
USE_LITE_KERNEL(expand, kOpenCL, kFP16, kImageDefault, image2d);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册