提交 5014c3ce 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] replace vlog with log in pool, add each run duration print in test (#3010)

* replace vlog with log in pool, add each run duration print in test. test=develop

* change layout file path. test=develop

* fix act, layout image kernel. test=develop

* fix spelling error. test=develop

* remove clfinish in act, concat, layout, nearest_interp. test=develop

* add RELU macro define in fc opencl kernel. test=develop

* add cpu_ref print in activation opencl kernel. test=develop

* fix layout ut. test=develop

* replace log with vlog. test=develop

* fix get output. test=develop
上级 31858fb0
......@@ -53,9 +53,13 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Run();
}
auto start = GetCurrentUS();
double sum_duration = 0.0; // millisecond;
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor.Run();
auto duration = (GetCurrentUS() - start) / 1000.0;
sum_duration += duration;
VLOG(1) << "run_idx:" << i << " " << duration << " ms";
}
if (save_model) {
......@@ -68,8 +72,7 @@ void TestModel(const std::vector<Place>& valid_places,
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
<< ", spend " << sum_duration / FLAGS_repeats << " ms in average.";
std::vector<std::vector<float>> ref;
ref.emplace_back(std::vector<float>(
......@@ -115,13 +118,11 @@ void TestModel(const std::vector<Place>& valid_places,
}
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
......
......@@ -54,9 +54,13 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Run();
}
auto start = GetCurrentUS();
double sum_duration = 0.0; // millisecond;
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor.Run();
auto duration = (GetCurrentUS() - start) / 1000.0;
sum_duration += duration;
VLOG(1) << "run_idx:" << i << " " << duration << " ms";
}
if (save_model) {
......@@ -69,8 +73,7 @@ void TestModel(const std::vector<Place>& valid_places,
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
<< ", spend " << sum_duration / FLAGS_repeats << " ms in average.";
std::vector<std::vector<float>> ref;
// i = 1
......@@ -117,13 +120,11 @@ void TestModel(const std::vector<Place>& valid_places,
}
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
......
......@@ -18,7 +18,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
......@@ -68,7 +68,7 @@ lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_comput
DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
......
......@@ -44,9 +44,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<half_t, cl::Image2D>();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>(
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
......@@ -57,9 +57,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
......@@ -82,9 +82,7 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(out_img, event_);
}
private:
......@@ -112,9 +110,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<half_t, cl::Image2D>();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>(
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef;
......@@ -126,9 +124,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status);
......@@ -154,9 +152,7 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(out_img, event_);
}
private:
......@@ -185,11 +181,11 @@ class SigmoidComputeImageDefault
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf =
auto* x_img =
param.X->data<half_t,
cl::Image2D>(); // use half_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
// represents half float
image_shape["width"],
image_shape["height"]);
......@@ -202,9 +198,9 @@ class SigmoidComputeImageDefault
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
......@@ -227,9 +223,7 @@ class SigmoidComputeImageDefault
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(out_img, event_);
}
private:
......
......@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (1e0)
namespace paddle {
namespace lite {
......@@ -58,8 +61,8 @@ TEST(relu_image2d_fp16, compute) {
"-> host";
#ifdef RELU_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int n = 1; n <= 2; n += 1) {
for (auto c : {1}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
......@@ -169,6 +172,21 @@ TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
......@@ -176,18 +194,24 @@ TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
<< ", ref: " << y_data_ref[eidx] << std::endl;
}
#endif // RELU_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
auto abs_diff = COMPUTE_ABS_DIFF(y_data_ref[eidx], mapped_y[eidx]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(y_data_ref[eidx], mapped_y[eidx]);
EXPECT_EQ(
(relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << eidx << ", y_data_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx
<< "]:" << mapped_y[eidx] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break;
}
}
......@@ -206,7 +230,7 @@ TEST(relu_image2d_fp16, compute) {
#endif
}
// #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP16_PRINT_RESULT
TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
......@@ -287,7 +311,7 @@ TEST(relu6_image2d_fp16, compute) {
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2 * 0.1;
mapped_y[i] = static_cast<int>(0);
}
auto *relu_in_data = relu_in.mutable_data<half_t, cl::Image2D>(
......@@ -326,6 +350,21 @@ TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
// result
......@@ -333,14 +372,14 @@ TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
<< ", ref: " << y_data_ref[eidx] << std::endl;
}
#endif // RELU6_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > FP16_MAX_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
......@@ -485,6 +524,21 @@ TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu
sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
......@@ -492,14 +546,14 @@ TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
<< ", ref:" << y_data_ref[eidx] << std::endl;
}
#endif // SIGMOID_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > FP16_MAX_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
......
......@@ -109,25 +109,28 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
LOG(INFO) << "concat 输入尺寸: ";
VLOG(4) << "concat 输入尺寸: ";
for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:"
<< " dims:" << inputs[i]->dims()[0] << " "
<< inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
<< inputs[i]->dims()[3];
VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:"
<< " dims:" << inputs[i]->dims()[0] << " "
<< inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
<< inputs[i]->dims()[3];
}
LOG(INFO) << "concat 输出尺寸: ";
LOG(INFO) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3];
LOG(INFO) << "axis_: " << axis_;
LOG(INFO) << "flag_: " << flag_;
VLOG(4) << "concat 输出尺寸: ";
VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
......@@ -136,16 +139,17 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
<< "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
LOG(INFO) << "width_: " << width_ << ", flag_: " << flag_;
VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]);
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1];
int out_c = x_dims[1];
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
......@@ -171,14 +175,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(out_buf, event_);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
auto in_dims = inputs[i]->dims();
image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<float, cl::Image2D>();
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1];
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
......@@ -212,7 +216,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(out_buf, event_);
start += inputs[i]->dims()[axis_];
}
}
......
......@@ -245,6 +245,21 @@ TEST(concat_image2d, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu
std::vector<const float *> ins_ptr;
std::vector<const DDim> in_dim;
......
......@@ -471,7 +471,7 @@ TEST(conv2d, compute_image2d_1x1) {
for (int i = 0; i < out_dim.production(); i++) {
auto relative_diff =
COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
// EXPECT_LT(relative_diff, FP16_MAX_DIFF);
EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
abs_diff > FP16_ABS_DIFF);
......@@ -1191,7 +1191,7 @@ TEST(conv2d, compute_image2d_5x5) {
for (int i = 0; i < out_dim.production(); i++) {
auto relative_diff =
COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
abs_diff > FP16_ABS_DIFF);
if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
......@@ -1540,7 +1540,7 @@ TEST(conv2d, compute_image2d_7x7) {
for (int i = 0; i < out_dim.production(); i++) {
auto relative_diff =
COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
abs_diff > FP16_ABS_DIFF);
if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
......
......@@ -57,6 +57,10 @@ class FcCompute
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)};
}
if (param.activation_type == "relu") {
build_options_ += "-DRELU";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
......@@ -107,7 +111,7 @@ class FcCompute
private:
int m_, n_, k_;
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE=float"};
std::string build_options_{"-DCL_DTYPE_float "};
cl::NDRange global_work_size_;
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -103,9 +103,6 @@ class IoCopykOpenCLToHostCompute
auto* wait_list = context.cl_wait_list();
auto* x_ptr = param.x->data<float, cl::Buffer>();
/* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to
`cl_wait_list`
in kernel and `wait_list` enabled
auto it = wait_list->find(x_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
......@@ -114,7 +111,6 @@ class IoCopykOpenCLToHostCompute
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
*/
CopyToHostSync(data, param.x->raw_data(), mem_size);
}
......
......@@ -44,7 +44,7 @@ class LayoutComputeBufferChwToImageDefault
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
kernel_func_name_, "image/layout_kernel.cl", build_options_);
}
void Run() override {
......@@ -126,9 +126,7 @@ class LayoutComputeBufferChwToImageDefault
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(y_data, event_);
}
std::string doc() const override {
......@@ -155,7 +153,7 @@ class LayoutComputeImageDefaultToBufferChw
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
kernel_func_name_, "image/layout_kernel.cl", build_options_);
}
void Run() override {
......@@ -229,9 +227,7 @@ class LayoutComputeImageDefaultToBufferChw
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(y_data, event_);
}
std::string doc() const override {
......@@ -325,10 +321,7 @@ class LayoutComputeBufferChwToImage2DNw
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
// auto image_shape = InitImageDimInfoWith(x_dims);
context.cl_wait_list()->emplace(y_data, event_);
}
std::string doc() const override {
......
......@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (1e0)
namespace paddle {
namespace lite {
......@@ -86,7 +89,7 @@ TEST(layout_ImageDefault, compute) {
auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<float>(i) * 2;
mapped_x[i] = static_cast<float>(i) * 0.01;
}
// set context and kernel args
......@@ -122,14 +125,19 @@ TEST(layout_ImageDefault, compute) {
#endif // PRINT_RESULT
// check result: compare input and output
float MAX_PASS_DIFF = 1e-4;
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF);
if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", mapped_x[" << eidx
<< "]:" << mapped_x[eidx] << ", mapped_y[" << eidx
<< "]:" << mapped_y[eidx];
for (int i = 0; i < x_dim.production(); i++) {
auto abs_diff = COMPUTE_ABS_DIFF(mapped_x[i], mapped_y[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(mapped_x[i], mapped_y[i]);
EXPECT_EQ(
(relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_x[" << i
<< "]:" << mapped_x[i] << " mapped_y[" << i
<< "]:" << mapped_y[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break;
}
}
......@@ -238,12 +246,27 @@ TEST(layout_ImageDefault_With_Pre_Post, compute) {
LOG(INFO) << "run kernel: image2d_to_buffer_with_post255";
img_to_buf_kernel->Launch();
// wait for opencl
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> "
<< static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
std::cout << +mapped_x[eidx] << " -> "
<< +static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
}
#endif // PRINT_RESULT
......
......@@ -46,11 +46,11 @@ class NearestInterpComputeImageDefault
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
const auto& y_dims = param.Out->dims();
auto* x_buf =
auto* x_img =
param.X->data<half_t,
cl::Image2D>(); // use half_t represents half float
auto out_image_shape = InitImageDimInfoWith(y_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
// represents half float
out_image_shape["width"],
out_image_shape["height"]);
......@@ -69,9 +69,9 @@ class NearestInterpComputeImageDefault
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
......@@ -112,9 +112,7 @@ class NearestInterpComputeImageDefault
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
context.cl_wait_list()->emplace(out_img, event_);
}
private:
......
......@@ -208,6 +208,21 @@ TEST(nearest_interp_image2d, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu
for (int nid = 0; nid < x_dim[0]; ++nid) {
for (int cid = 0; cid < x_dim[1]; ++cid) {
......
......@@ -69,14 +69,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
CHECK(context.cl_context() != nullptr);
auto* x_img = param.x->data<half_t, cl::Image2D>();
LOG(INFO) << "x_image" << x_img;
VLOG(4) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(out_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img;
VLOG(4) << "out_image" << out_img;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......
......@@ -63,7 +63,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height"));
LOG(INFO) << "out_dims= " << out_dims;
VLOG(4) << "out_dims= " << out_dims;
const std::vector<size_t>& default_work_size = DefaultWorkSize(
out_dims,
......
......@@ -14,7 +14,7 @@
#pragma once
#define COMPTUE_ABS_DIFF(res0, res1) abs(res0 - res1)
// Absolute difference between two scalar results.
// Both arguments are parenthesized so that compound expressions
// (e.g. `COMPUTE_ABS_DIFF(a, b + c)`) are subtracted as a whole instead of
// being re-associated by operator precedence.
#define COMPUTE_ABS_DIFF(res0, res1) abs((res0) - (res1))
// Difference between res0 and res1 relative to res1.
// 1e-5 is added to the denominator, presumably to avoid dividing by zero when
// res1 is 0 (NOTE(review): a res1 close to -1e-5 still yields a near-zero
// denominator).
// res1 is expanded twice, so do not pass expressions with side effects.
#define COMPUTE_RELATIVE_DIFF(res0, res1) \
  abs(abs((res0) - (res1)) / ((res1) + 1e-5))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册