提交 5014c3ce 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] replace vlog with log in pool, add each run duration print in test (#3010)

* replace vlog with log in pool, add each run duration print in test. test=develop

* change layout file path. test=develop

* fix act, layout image kernel. test=develop

* fix spell error. test=develop

* remove clfinish in act, concat, layout, nearest_interp. test=develop

* add RELU macro define in fc opencl kernel. test=develop

* add cpu_ref print in activation opencl kernel.test=develop

* fix layout ut. test=develop

* replace log with vlog. test=develop

* fix get output. test=develop
上级 31858fb0
...@@ -53,9 +53,13 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -53,9 +53,13 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Run(); predictor.Run();
} }
auto start = GetCurrentUS(); double sum_duration = 0.0; // millisecond;
for (int i = 0; i < FLAGS_repeats; ++i) { for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor.Run(); predictor.Run();
auto duration = (GetCurrentUS() - start) / 1000.0;
sum_duration += duration;
VLOG(1) << "run_idx:" << i << " " << duration << " ms";
} }
if (save_model) { if (save_model) {
...@@ -68,8 +72,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -68,8 +72,7 @@ void TestModel(const std::vector<Place>& valid_places,
LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", spend " << sum_duration / FLAGS_repeats << " ms in average.";
<< " ms in average.";
std::vector<std::vector<float>> ref; std::vector<std::vector<float>> ref;
ref.emplace_back(std::vector<float>( ref.emplace_back(std::vector<float>(
...@@ -115,13 +118,11 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -115,13 +118,11 @@ void TestModel(const std::vector<Place>& valid_places,
} }
// Get detailed result // Get detailed result
auto* pred = &predictor; size_t output_tensor_num = predictor.GetOutputNames().size();
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num; VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor( auto* output_tensor = predictor.GetOutput(tidx);
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n"; VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims(); auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims; VLOG(1) << "out_dims:" << out_dims;
......
...@@ -54,9 +54,13 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -54,9 +54,13 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Run(); predictor.Run();
} }
auto start = GetCurrentUS(); double sum_duration = 0.0; // millisecond;
for (int i = 0; i < FLAGS_repeats; ++i) { for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor.Run(); predictor.Run();
auto duration = (GetCurrentUS() - start) / 1000.0;
sum_duration += duration;
VLOG(1) << "run_idx:" << i << " " << duration << " ms";
} }
if (save_model) { if (save_model) {
...@@ -69,8 +73,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -69,8 +73,7 @@ void TestModel(const std::vector<Place>& valid_places,
LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", spend " << sum_duration / FLAGS_repeats << " ms in average.";
<< " ms in average.";
std::vector<std::vector<float>> ref; std::vector<std::vector<float>> ref;
// i = 1 // i = 1
...@@ -117,13 +120,11 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -117,13 +120,11 @@ void TestModel(const std::vector<Place>& valid_places,
} }
// Get detailed result // Get detailed result
auto* pred = &predictor; size_t output_tensor_num = predictor.GetOutputNames().size();
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num; VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor( auto* output_tensor = predictor.GetOutput(tidx);
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n"; VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims(); auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims; VLOG(1) << "out_dims:" << out_dims;
......
...@@ -18,7 +18,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_ ...@@ -18,7 +18,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
...@@ -68,7 +68,7 @@ lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_comput ...@@ -68,7 +68,7 @@ lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_comput
DEPS elementwise_mul_opencl op_registry program context DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
......
...@@ -44,9 +44,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -44,9 +44,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<half_t, cl::Image2D>(); auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>( auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only const auto& y_dims = param.Out->dims(); // useless: check dim only
...@@ -57,9 +57,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -57,9 +57,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
...@@ -82,9 +82,7 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -82,9 +82,7 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(out_img, event_);
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
...@@ -112,9 +110,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -112,9 +110,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<half_t, cl::Image2D>(); auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>( auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef; auto threshold = param.Relu_clipped_coef;
...@@ -126,9 +124,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -126,9 +124,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold); status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -154,9 +152,7 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL), ...@@ -154,9 +152,7 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(out_img, event_);
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
...@@ -185,11 +181,11 @@ class SigmoidComputeImageDefault ...@@ -185,11 +181,11 @@ class SigmoidComputeImageDefault
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = auto* x_img =
param.X->data<half_t, param.X->data<half_t,
cl::Image2D>(); // use half_t represents half float cl::Image2D>(); // use half_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
// represents half float // represents half float
image_shape["width"], image_shape["width"],
image_shape["height"]); image_shape["height"]);
...@@ -202,9 +198,9 @@ class SigmoidComputeImageDefault ...@@ -202,9 +198,9 @@ class SigmoidComputeImageDefault
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
...@@ -227,9 +223,7 @@ class SigmoidComputeImageDefault ...@@ -227,9 +223,7 @@ class SigmoidComputeImageDefault
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(out_img, event_);
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
......
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (1e0)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -58,8 +61,8 @@ TEST(relu_image2d_fp16, compute) { ...@@ -58,8 +61,8 @@ TEST(relu_image2d_fp16, compute) {
"-> host"; "-> host";
#ifdef RELU_FP16_LOOP_TEST #ifdef RELU_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) { for (int n = 1; n <= 2; n += 1) {
for (auto c : {1, 3}) { for (auto c : {1}) {
for (int h = 12; h <= 100; h += 13) { for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) { for (int w = 12; w <= 100; w += 25) {
#else #else
...@@ -169,6 +172,21 @@ TEST(relu_image2d_fp16, compute) { ...@@ -169,6 +172,21 @@ TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu // compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref); relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result // result
...@@ -176,18 +194,24 @@ TEST(relu_image2d_fp16, compute) { ...@@ -176,18 +194,24 @@ TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << ", ref: " << y_data_ref[eidx] << std::endl;
} }
#endif // RELU_FP16_PRINT_RESULT #endif // RELU_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); auto abs_diff = COMPUTE_ABS_DIFF(y_data_ref[eidx], mapped_y[eidx]);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { auto relative_diff =
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx COMPUTE_RELATIVE_DIFF(y_data_ref[eidx], mapped_y[eidx]);
<< " / " << x_dim.production() << ", y_data_ref[" EXPECT_EQ(
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
<< eidx << "]:" << mapped_y[eidx]; true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << eidx << ", y_data_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx
<< "]:" << mapped_y[eidx] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break; break;
} }
} }
...@@ -206,7 +230,7 @@ TEST(relu_image2d_fp16, compute) { ...@@ -206,7 +230,7 @@ TEST(relu_image2d_fp16, compute) {
#endif #endif
} }
// #define RELU6_FP16_LOOP_TEST // #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP16_PRINT_RESULT // #define RELU6_FP16_PRINT_RESULT
TEST(relu6_image2d_fp16, compute) { TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
...@@ -287,7 +311,7 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -287,7 +311,7 @@ TEST(relu6_image2d_fp16, compute) {
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map( auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production())); y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2 * 0.1;
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<half_t, cl::Image2D>( auto *relu_in_data = relu_in.mutable_data<half_t, cl::Image2D>(
...@@ -326,6 +350,21 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -326,6 +350,21 @@ TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu // compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f); relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
// result // result
...@@ -333,14 +372,14 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -333,14 +372,14 @@ TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << ", ref: " << y_data_ref[eidx] << std::endl;
} }
#endif // RELU6_FP16_PRINT_RESULT #endif // RELU6_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { if (abs(y_data_ref[eidx] - mapped_y[eidx]) > FP16_MAX_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref[" << " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
...@@ -485,6 +524,21 @@ TEST(sigmoid_image2d_fp16, compute) { ...@@ -485,6 +524,21 @@ TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu // compute ref cpu
sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref); sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result // result
...@@ -492,14 +546,14 @@ TEST(sigmoid_image2d_fp16, compute) { ...@@ -492,14 +546,14 @@ TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << ", ref:" << y_data_ref[eidx] << std::endl;
} }
#endif // SIGMOID_FP16_PRINT_RESULT #endif // SIGMOID_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3); EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) { if (abs(y_data_ref[eidx] - mapped_y[eidx]) > FP16_MAX_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref[" << " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]: " << y_data_ref[eidx] << ", mapped_y[" << eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
......
...@@ -109,25 +109,28 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -109,25 +109,28 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
int arg_idx = 0; int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1]; int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
LOG(INFO) << "concat 输入尺寸: "; VLOG(4) << "concat 输入尺寸: ";
for (size_t i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputs [" << i << "]" VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:" << "[" << inputs[i]->dims().size() << "D]:"
<< " dims:" << inputs[i]->dims()[0] << " " << " dims:" << inputs[i]->dims()[0] << " "
<< inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " " << inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
<< inputs[i]->dims()[3]; << inputs[i]->dims()[3];
} }
LOG(INFO) << "concat 输出尺寸: ";
LOG(INFO) << " out dims: " VLOG(4) << "concat 输出尺寸: ";
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] VLOG(4) << " out dims: "
<< " " << x_dims[2] << " " << x_dims[3]; << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
LOG(INFO) << "axis_: " << axis_; << " " << x_dims[2] << " " << x_dims[3];
LOG(INFO) << "flag_: " << flag_; VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_;
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]), cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] / static_cast<cl::size_type>(image_shape["width"] /
x_dims[x_dims.size() - 1]), x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])}; static_cast<cl::size_type>(image_shape["height"])};
VLOG(4) << TargetToStr(param.output->target()); VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"]; << image_shape["height"];
...@@ -136,16 +139,17 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -136,16 +139,17 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
<< "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1]; << "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
LOG(INFO) << "width_: " << width_ << ", flag_: " << flag_; VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " " VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " " << (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]); << (image_shape["height"]);
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1]; int out_w = x_dims[x_dims.size() - 1];
int out_c = x_dims[1]; int out_c = x_dims[1];
if (inputs.size() == 2) { if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Image2D>(); auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<float, cl::Image2D>(); auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0); cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1); status = kernel.setArg(++arg_idx, *x_buf1);
...@@ -171,14 +175,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -171,14 +175,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish(); context.cl_wait_list()->emplace(out_buf, event_);
} else { } else {
auto start = 0; auto start = 0;
for (int i = 0; i < inputs.size(); i++) { for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0; arg_idx = 0;
auto in_dims = inputs[i]->dims(); auto in_dims = inputs[i]->dims();
image_shape = InitImageDimInfoWith(in_dims); image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<float, cl::Image2D>(); auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1]; int in_w = in_dims[in_dims.size() - 1];
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"]; << image_shape["height"];
...@@ -212,7 +216,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -212,7 +216,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish(); context.cl_wait_list()->emplace(out_buf, event_);
start += inputs[i]->dims()[axis_]; start += inputs[i]->dims()[axis_];
} }
} }
......
...@@ -245,6 +245,21 @@ TEST(concat_image2d, compute) { ...@@ -245,6 +245,21 @@ TEST(concat_image2d, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cp_u // compute ref cp_u
std::vector<const float *> ins_ptr; std::vector<const float *> ins_ptr;
std::vector<const DDim> in_dim; std::vector<const DDim> in_dim;
......
...@@ -471,7 +471,7 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -471,7 +471,7 @@ TEST(conv2d, compute_image2d_1x1) {
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
auto relative_diff = auto relative_diff =
COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]); COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]); auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
// EXPECT_LT(relative_diff, FP16_MAX_DIFF); // EXPECT_LT(relative_diff, FP16_MAX_DIFF);
EXPECT_FALSE(relative_diff > FP16_MAX_DIFF && EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
abs_diff > FP16_ABS_DIFF); abs_diff > FP16_ABS_DIFF);
...@@ -1191,7 +1191,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1191,7 +1191,7 @@ TEST(conv2d, compute_image2d_5x5) {
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
auto relative_diff = auto relative_diff =
COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]); COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]); auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
EXPECT_FALSE(relative_diff > FP16_MAX_DIFF && EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
abs_diff > FP16_ABS_DIFF); abs_diff > FP16_ABS_DIFF);
if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) { if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
...@@ -1540,7 +1540,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1540,7 +1540,7 @@ TEST(conv2d, compute_image2d_7x7) {
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
auto relative_diff = auto relative_diff =
COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]); COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]); auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
EXPECT_FALSE(relative_diff > FP16_MAX_DIFF && EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
abs_diff > FP16_ABS_DIFF); abs_diff > FP16_ABS_DIFF);
if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) { if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
......
...@@ -57,6 +57,10 @@ class FcCompute ...@@ -57,6 +57,10 @@ class FcCompute
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4), global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)}; static_cast<size_t>((n_ + 3) / 4)};
} }
if (param.activation_type == "relu") {
build_options_ += "-DRELU";
}
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/fc_kernel.cl", build_options_); kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
...@@ -107,7 +111,7 @@ class FcCompute ...@@ -107,7 +111,7 @@ class FcCompute
private: private:
int m_, n_, k_; int m_, n_, k_;
std::string kernel_func_name_{}; std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE=float"}; std::string build_options_{"-DCL_DTYPE_float "};
cl::NDRange global_work_size_; cl::NDRange global_work_size_;
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -103,9 +103,6 @@ class IoCopykOpenCLToHostCompute ...@@ -103,9 +103,6 @@ class IoCopykOpenCLToHostCompute
auto* wait_list = context.cl_wait_list(); auto* wait_list = context.cl_wait_list();
auto* x_ptr = param.x->data<float, cl::Buffer>(); auto* x_ptr = param.x->data<float, cl::Buffer>();
/* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to
`cl_wait_list`
in kernel and `wait_list` enabled
auto it = wait_list->find(x_ptr); auto it = wait_list->find(x_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -114,7 +111,6 @@ class IoCopykOpenCLToHostCompute ...@@ -114,7 +111,6 @@ class IoCopykOpenCLToHostCompute
} else { } else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor."; LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
} }
*/
CopyToHostSync(data, param.x->raw_data(), mem_size); CopyToHostSync(data, param.x->raw_data(), mem_size);
} }
......
...@@ -44,7 +44,7 @@ class LayoutComputeBufferChwToImageDefault ...@@ -44,7 +44,7 @@ class LayoutComputeBufferChwToImageDefault
} }
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/layout_kernel.cl", build_options_); kernel_func_name_, "image/layout_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
...@@ -126,9 +126,7 @@ class LayoutComputeBufferChwToImageDefault ...@@ -126,9 +126,7 @@ class LayoutComputeBufferChwToImageDefault
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(y_data, event_);
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
} }
std::string doc() const override { std::string doc() const override {
...@@ -155,7 +153,7 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -155,7 +153,7 @@ class LayoutComputeImageDefaultToBufferChw
} }
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/layout_kernel.cl", build_options_); kernel_func_name_, "image/layout_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
...@@ -229,9 +227,7 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -229,9 +227,7 @@ class LayoutComputeImageDefaultToBufferChw
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(y_data, event_);
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
} }
std::string doc() const override { std::string doc() const override {
...@@ -325,10 +321,7 @@ class LayoutComputeBufferChwToImage2DNw ...@@ -325,10 +321,7 @@ class LayoutComputeBufferChwToImage2DNw
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(y_data, event_);
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
// auto image_shape = InitImageDimInfoWith(x_dims);
} }
std::string doc() const override { std::string doc() const override {
......
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (1e0)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -86,7 +89,7 @@ TEST(layout_ImageDefault, compute) { ...@@ -86,7 +89,7 @@ TEST(layout_ImageDefault, compute) {
auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map( auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production())); y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<float>(i) * 2; mapped_x[i] = static_cast<float>(i) * 0.01;
} }
// set context and kernel args // set context and kernel args
...@@ -122,14 +125,19 @@ TEST(layout_ImageDefault, compute) { ...@@ -122,14 +125,19 @@ TEST(layout_ImageDefault, compute) {
#endif // PRINT_RESULT #endif // PRINT_RESULT
// check result: compare input and output // check result: compare input and output
float MAX_PASS_DIFF = 1e-4; for (int i = 0; i < x_dim.production(); i++) {
for (int eidx = 0; eidx < x_dim.production(); eidx++) { auto abs_diff = COMPUTE_ABS_DIFF(mapped_x[i], mapped_y[i]);
EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF); auto relative_diff =
if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) { COMPUTE_RELATIVE_DIFF(mapped_x[i], mapped_y[i]);
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx EXPECT_EQ(
<< " / " << x_dim.production() << ", mapped_x[" << eidx (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
<< "]:" << mapped_x[eidx] << ", mapped_y[" << eidx true);
<< "]:" << mapped_y[eidx]; if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_x[" << i
<< "]:" << mapped_x[i] << " mapped_y[" << i
<< "]:" << mapped_y[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break; break;
} }
} }
...@@ -238,12 +246,27 @@ TEST(layout_ImageDefault_With_Pre_Post, compute) { ...@@ -238,12 +246,27 @@ TEST(layout_ImageDefault_With_Pre_Post, compute) {
LOG(INFO) << "run kernel: image2d_to_buffer_with_post255"; LOG(INFO) << "run kernel: image2d_to_buffer_with_post255";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// wait for opencl
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// result // result
#ifdef PRINT_RESULT #ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----"; LOG(INFO) << "---- print result ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " std::cout << +mapped_x[eidx] << " -> "
<< static_cast<uint8_t>(mapped_y[eidx]) << std::endl; << +static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
} }
#endif // PRINT_RESULT #endif // PRINT_RESULT
......
...@@ -46,11 +46,11 @@ class NearestInterpComputeImageDefault ...@@ -46,11 +46,11 @@ class NearestInterpComputeImageDefault
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
const auto& y_dims = param.Out->dims(); const auto& y_dims = param.Out->dims();
auto* x_buf = auto* x_img =
param.X->data<half_t, param.X->data<half_t,
cl::Image2D>(); // use half_t represents half float cl::Image2D>(); // use half_t represents half float
auto out_image_shape = InitImageDimInfoWith(y_dims); auto out_image_shape = InitImageDimInfoWith(y_dims);
auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>( // use half_t
// represents half float // represents half float
out_image_shape["width"], out_image_shape["width"],
out_image_shape["height"]); out_image_shape["height"]);
...@@ -69,9 +69,9 @@ class NearestInterpComputeImageDefault ...@@ -69,9 +69,9 @@ class NearestInterpComputeImageDefault
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h)); status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -112,9 +112,7 @@ class NearestInterpComputeImageDefault ...@@ -112,9 +112,7 @@ class NearestInterpComputeImageDefault
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` context.cl_wait_list()->emplace(out_img, event_);
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
......
...@@ -208,6 +208,21 @@ TEST(nearest_interp_image2d, compute) { ...@@ -208,6 +208,21 @@ TEST(nearest_interp_image2d, compute) {
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// wait for opencl
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
// compute ref cpu // compute ref cpu
for (int nid = 0; nid < x_dim[0]; ++nid) { for (int nid = 0; nid < x_dim[0]; ++nid) {
for (int cid = 0; cid < x_dim[1]; ++cid) { for (int cid = 0; cid < x_dim[1]; ++cid) {
......
...@@ -69,14 +69,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -69,14 +69,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
auto* x_img = param.x->data<half_t, cl::Image2D>(); auto* x_img = param.x->data<half_t, cl::Image2D>();
LOG(INFO) << "x_image" << x_img; VLOG(4) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " " VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>( auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img; VLOG(4) << "out_image" << out_img;
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
......
...@@ -63,7 +63,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -63,7 +63,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
InitImageDimInfoWith(out_dims); InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>( cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height")); out_image_shape.at("width"), out_image_shape.at("height"));
LOG(INFO) << "out_dims= " << out_dims; VLOG(4) << "out_dims= " << out_dims;
const std::vector<size_t>& default_work_size = DefaultWorkSize( const std::vector<size_t>& default_work_size = DefaultWorkSize(
out_dims, out_dims,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#define COMPTUE_ABS_DIFF(res0, res1) abs(res0 - res1) #define COMPUTE_ABS_DIFF(res0, res1) abs(res0 - res1)
#define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5)) #define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册