From 62c6d5d513a6974615ce5a2924bfeeb70dd90a86 Mon Sep 17 00:00:00 2001 From: ysh329 Date: Wed, 8 Jul 2020 15:06:50 +0800 Subject: [PATCH] [OPENCL] Fix opencl fc int16 model bug caused by fc kernel (#3900) * Fix abnormal int16 model weights caused by the OpenCL fc kernel. test=develop --- .../opencl/cl_kernel/buffer/fc_kernel.cl | 4 ++- lite/kernels/opencl/conv_image_compute.cc | 2 +- lite/kernels/opencl/fc_buffer_compute.cc | 35 +++++++++++++++---- lite/kernels/opencl/fc_buffer_compute_test.cc | 18 +++++----- 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index a14748c69f..080ce2b457 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0 = 0; + CL_COMPUTE_DTYPE b0 = 0; + CL_COMPUTE_DTYPE c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index daf732f4b6..5b9e3b220a 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -28,7 +28,7 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { -/* image kernel*/ + void ConvImageCompute::PrepareForRun() { const auto& param = this->Param(); auto x_dims = param.x->dims(); diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index 9763faf2f3..3a31c8993d 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -35,10 +35,27 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override {} + void PrepareForRun() override { + fc_param_ = param_.get_mutable(); + auto w_t = fc_param_->w; + auto bias_t = fc_param_->bias; + + w_gpu_t_ = std::unique_ptr(new Tensor); + auto w_gpu_data = + w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size()); + TargetWrapperCL::MemcpySync( + w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD); + + bias_gpu_t_ = std::unique_ptr(new Tensor); + auto b_gpu_data = + bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size()); + TargetWrapperCL::MemcpySync(b_gpu_data, + bias_t->raw_data(), + bias_t->memory_size(), + IoDirection::HtoD); + } void ReInitWhenNeeded() override { - fc_param_ = param_.get_mutable(); const auto x_dims = fc_param_->input->dims(); if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || first_epoch_for_reinit_) { @@ -93,7 +110,7 @@ class FcCompute } void GetGlobalWorkSize() { - if (m_ == 1) { // gemv + if (kernel_func_name_ == "fc_gemv_1x4") { // gemv global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), @@ -103,8 +120,8 
@@ class FcCompute void Run() override { auto* x_buf = fc_param_->input->data(); - auto* w_buf = fc_param_->w->data(); - auto* bias_buf = fc_param_->bias->data(); + auto* w_buf = w_gpu_t_->data(); + auto* bias_buf = bias_gpu_t_->data(); auto* out_buf = fc_param_->output->mutable_data(TARGET(kOpenCL)); @@ -154,6 +171,10 @@ class FcCompute std::string time_stamp_{GetTimeStamp()}; bool first_epoch_for_reinit_{true}; DDim last_x_dims_; + + std::unique_ptr w_gpu_t_{nullptr}; + std::unique_ptr bias_gpu_t_{nullptr}; + cl::NDRange global_work_size_; cl::Kernel kernel_; }; @@ -166,7 +187,7 @@ class FcCompute REGISTER_LITE_KERNEL( fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc index 4c9c8c47e4..85793dffee 100644 --- a/lite/kernels/opencl/fc_buffer_compute_test.cc +++ b/lite/kernels/opencl/fc_buffer_compute_test.cc @@ -126,9 +126,11 @@ TEST(fc, compute) { out.Resize(out_dim); out_ref.Resize(out_dim); + VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim; + auto* x_data = x.mutable_data(TARGET(kOpenCL)); - auto* w_data = w.mutable_data(TARGET(kOpenCL)); - auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); + auto* w_data = w.mutable_data(); + auto* bias_data = bias.mutable_data(); auto* out_data = out.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; @@ -148,17 +150,15 @@ TEST(fc, compute) { } for (size_t i = 0; i < w_dim.production(); ++i) { w_source[i] = static_cast(dist(engine)); + w_data[i] = w_source[i]; } for (size_t i = 0; i < 
bias_dim.production(); ++i) { bias_source[i] = 10; // static_cast(dist(engine)); + bias_data[i] = 10; } TargetWrapperCL::MemcpySync( x_data, x_source.data(), x_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - w_data, w_source.data(), w_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - bias_data, bias_source.data(), bias_size, IoDirection::HtoD); // run opencl kernel kernel->Launch(); @@ -186,8 +186,10 @@ TEST(fc, compute) { #endif std::vector out_data_from_gpu(out_dim.production()); - TargetWrapperCL::MemcpySync( - out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH); + TargetWrapperCL::MemcpySync(out_data_from_gpu.data(), + out_data, + out_data_from_gpu.size() * sizeof(float), + IoDirection::DtoH); // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); -- GitLab