[OPENCL] Fix opencl fc int16 model bug caused by fc kernel (#3900)

* Fix the OpenCL fc kernel, which caused abnormal weights in int16 models. test=develop

[OPENCL] Fix opencl fc int16 model bug caused by fc kernel (#3900)
* Fix the OpenCL fc kernel, which caused abnormal weights in int16 models. test=develop
62c6d5d5 · ysh329 · GitHub · 498f147d · 62c6d5d5 · 62c6d5d5
4 changed files
--- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
    } else {
        for (int cidx = col; cidx < N; ++cidx) {
            for (int ridx = row; ridx < M; ++ridx) {
-                CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
+                CL_COMPUTE_DTYPE a0 = 0;
+                CL_COMPUTE_DTYPE b0 = 0;
+                CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
                for (int p = 0; p < K; ++p) {
                    a0 = *(a + ridx * K + p);
                    b0 = *(b + p * N + cidx),

--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -28,7 +28,7 @@ namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
-/* image kernel*/
+
 void ConvImageCompute::PrepareForRun() {
  const auto& param = this->Param<param_t>();
  auto x_dims = param.x->dims();

--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -35,10 +35,27 @@ class FcCompute
 public:
  using param_t = operators::FcParam;

-  void PrepareForRun() override {}
+  void PrepareForRun() override {
+    fc_param_ = param_.get_mutable<param_t>();
+    auto w_t = fc_param_->w;
+    auto bias_t = fc_param_->bias;
+
+    w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto w_gpu_data =
+        w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size());
+    TargetWrapperCL::MemcpySync(
+        w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);
+
+    bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto b_gpu_data =
+        bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size());
+    TargetWrapperCL::MemcpySync(b_gpu_data,
+                                bias_t->raw_data(),
+                                bias_t->memory_size(),
+                                IoDirection::HtoD);
+  }

  void ReInitWhenNeeded() override {
-    fc_param_ = param_.get_mutable<param_t>();
    const auto x_dims = fc_param_->input->dims();
    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
        first_epoch_for_reinit_) {
@@ -93,7 +110,7 @@ class FcCompute
  }

  void GetGlobalWorkSize() {
-    if (m_ == 1) {  // gemv
+    if (kernel_func_name_ == "fc_gemv_1x4") {  // gemv
      global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
    } else {  // gemm
      global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
@@ -103,8 +120,8 @@ class FcCompute

  void Run() override {
    auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
-    auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
-    auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
+    auto* w_buf = w_gpu_t_->data<float, cl::Buffer>();
+    auto* bias_buf = bias_gpu_t_->data<float, cl::Buffer>();
    auto* out_buf =
        fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

@@ -154,6 +171,10 @@ class FcCompute
  std::string time_stamp_{GetTimeStamp()};
  bool first_epoch_for_reinit_{true};
  DDim last_x_dims_;
+
+  std::unique_ptr<Tensor> w_gpu_t_{nullptr};
+  std::unique_ptr<Tensor> bias_gpu_t_{nullptr};
+
  cl::NDRange global_work_size_;
  cl::Kernel kernel_;
 };
@@ -166,7 +187,7 @@ class FcCompute
 REGISTER_LITE_KERNEL(
    fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
--- a/lite/kernels/opencl/fc_buffer_compute_test.cc
+++ b/lite/kernels/opencl/fc_buffer_compute_test.cc
@@ -126,9 +126,11 @@ TEST(fc, compute) {
        out.Resize(out_dim);
        out_ref.Resize(out_dim);

+        VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim;
+
        auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-        auto* w_data = w.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-        auto* bias_data = bias.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+        auto* w_data = w.mutable_data<float>();
+        auto* bias_data = bias.mutable_data<float>();
        auto* out_data = out.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

        std::default_random_engine engine;
@@ -148,17 +150,15 @@ TEST(fc, compute) {
        }
        for (size_t i = 0; i < w_dim.production(); ++i) {
          w_source[i] = static_cast<int>(dist(engine));
+          w_data[i] = w_source[i];
        }
        for (size_t i = 0; i < bias_dim.production(); ++i) {
          bias_source[i] = 10;  // static_cast<int>(dist(engine));
+          bias_data[i] = 10;
        }

        TargetWrapperCL::MemcpySync(
            x_data, x_source.data(), x_size, IoDirection::HtoD);
-        TargetWrapperCL::MemcpySync(
-            w_data, w_source.data(), w_size, IoDirection::HtoD);
-        TargetWrapperCL::MemcpySync(
-            bias_data, bias_source.data(), bias_size, IoDirection::HtoD);

        // run opencl kernel
        kernel->Launch();
@@ -186,8 +186,10 @@ TEST(fc, compute) {
 #endif

        std::vector<float> out_data_from_gpu(out_dim.production());
-        TargetWrapperCL::MemcpySync(
-            out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH);
+        TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
+                                    out_data,
+                                    out_data_from_gpu.size() * sizeof(float),
+                                    IoDirection::DtoH);

        // run cpu ref
        auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));