未验证 提交 3f5ea092 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Fix opencl miscs (#3438)

* fix bilinear opencl kernel. test=develop

* [LITE][OPENCL] replace map with memsync. test=develop

* [GIT] Enhance ignore. test=develop

* replace map of Fc with MemCpySync. test=develop

* comment for mul of opencl. test=develop

* Fix opencl miscs. test=develop
上级 b92734f1
......@@ -63,6 +63,16 @@ test/models/
test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files
*~
......
......@@ -111,7 +111,8 @@ lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
# NOTE(ysh329): use fc as `mul`, and mul is not used.
#add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
......@@ -147,8 +148,8 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context)
lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
DEPS mul_opencl op_registry program context)
#lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
# DEPS mul_opencl op_registry program context)
#lite_cc_test(test_elementwise_add_buffer_opencl SRCS elementwise_add__buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context)
......
......@@ -176,7 +176,6 @@ TEST(bilinear_interp_image2d, compute) {
input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<half_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data());
// LOG(INFO) << "x_image:" << x_image;
DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim);
......@@ -184,9 +183,8 @@ TEST(bilinear_interp_image2d, compute) {
<< out_image_shape[1];
auto* out_image = out.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]);
// LOG(INFO) << "out_image:" << out_image;
kernel->Launch();
kernel->Launch();
CLRuntime::Global()->command_queue().finish();
std::unique_ptr<float[]> out_ref(
......
......@@ -41,9 +41,8 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
boxcoder_param_->box_normalized == true) {
kernel_func_name_ = "decode_center_size";
} else {
printf("This code_type %s doesn't support \n",
boxcoder_param_->code_type.c_str());
return;
LOG(FATAL) << "This code_type " << boxcoder_param_->code_type
<< " doesn't support";
}
CHECK(context.cl_context() != nullptr);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
......
......@@ -162,15 +162,27 @@ TEST(fc, compute) {
// run opencl kernel
kernel->Launch();
CLRuntime::Global()->command_queue().finish();
#if 0 // NOTE(ysh329): note event
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
CLRuntime::Global()->command_queue().finish();
#if 0
double start_nanos =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
} else {
LOG(FATAL)
<< "Could not find the sync event for the target cl tensor.";
}
#endif
std::vector<float> out_data_from_gpu(out_dim.production());
......@@ -201,18 +213,17 @@ TEST(fc, compute) {
out_data_from_gpu.data()[eidx]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(
out_ref_data[eidx], out_data_from_gpu.data()[eidx]);
// EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
// (abs_diff <= FP16_MAX_DIFF),
// true);
EXPECT_EQ(
(relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << eidx << ", out_ref_data[" << eidx
LOG(FATAL) << "error idx:" << eidx << ", out_ref_data[" << eidx
<< "]:" << out_ref_data[eidx]
<< ", out_data_from_gpu.data()[" << eidx
<< "]:" << out_data_from_gpu.data()[eidx]
<< " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
return;
}
}
......
......@@ -118,8 +118,11 @@ class LayoutComputeBufferChwToImageDefault
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]);
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
static_cast<cl::size_type>(new_dims[3]),
......
......@@ -84,7 +84,8 @@ TEST(slice_image2d_fp16, compute) {
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter = new CLImageConverterDefault();
std::shared_ptr<CLImageConverterDefault> default_converter(
new CLImageConverterDefault());
DDim image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "image_shape = " << image_shape[0] << " " << image_shape[1];
std::vector<half_t> x_image_data(image_shape.production() * 4); // 4 : RGBA
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册