diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
index a14748c69f3eafce515c90f2b8a226703fe5883d..080ce2b457421970409431dee6841ac4f7d57bb5 100644
--- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
   } else {
     for (int cidx = col; cidx < N; ++cidx) {
       for (int ridx = row; ridx < M; ++ridx) {
-        CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
+        CL_COMPUTE_DTYPE a0 = 0;
+        CL_COMPUTE_DTYPE b0 = 0;
+        CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
         for (int p = 0; p < K; ++p) {
           a0 = *(a + ridx * K + p);
           b0 = *(b + p * N + cidx),
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index daf732f4b6ec23d513b91fb5a2982be2066eb625..5b9e3b220a4134d466d6a4b5cbdfa2f42d12bdf6 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -28,7 +28,7 @@ namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
-/* image kernel*/
+
 void ConvImageCompute::PrepareForRun() {
   const auto& param = this->Param<param_t>();
   auto x_dims = param.x->dims();
diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc
index 9763faf2f33f578e6f62b07a8c89390e1b80c159..3a31c8993d77388b95260ad5c0be65f791c433eb 100644
--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -35,10 +35,27 @@ class FcCompute
  public:
   using param_t = operators::FcParam;
 
-  void PrepareForRun() override {}
+  void PrepareForRun() override {
+    fc_param_ = param_.get_mutable<param_t>();
+    auto w_t = fc_param_->w;
+    auto bias_t = fc_param_->bias;
+
+    w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto w_gpu_data =
+        w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size());
+    TargetWrapperCL::MemcpySync(
+        w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);
+
+    bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto b_gpu_data =
+        bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size());
+    TargetWrapperCL::MemcpySync(b_gpu_data,
+                                bias_t->raw_data(),
+                                bias_t->memory_size(),
+                                IoDirection::HtoD);
+  }
 
   void ReInitWhenNeeded() override {
-    fc_param_ = param_.get_mutable<param_t>();
     const auto x_dims = fc_param_->input->dims();
     if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
         first_epoch_for_reinit_) {
@@ -93,7 +110,7 @@ class FcCompute
   }
 
   void GetGlobalWorkSize() {
-    if (m_ == 1) {  // gemv
+    if (kernel_func_name_ == "fc_gemv_1x4") {  // gemv
       global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
     } else {  // gemm
       global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
@@ -103,8 +120,8 @@ class FcCompute
 
   void Run() override {
     auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
-    auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
-    auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
+    auto* w_buf = w_gpu_t_->data<float, cl::Buffer>();
+    auto* bias_buf = bias_gpu_t_->data<float, cl::Buffer>();
     auto* out_buf = fc_param_->output->mutable_data<float, cl::Buffer>(
         TARGET(kOpenCL));
 
@@ -154,6 +171,10 @@ class FcCompute
   std::string time_stamp_{GetTimeStamp()};
   bool first_epoch_for_reinit_{true};
   DDim last_x_dims_;
+
+  std::unique_ptr<Tensor> w_gpu_t_{nullptr};
+  std::unique_ptr<Tensor> bias_gpu_t_{nullptr};
+
   cl::NDRange global_work_size_;
   cl::Kernel kernel_;
 };
@@ -166,7 +187,7 @@ class FcCompute
 REGISTER_LITE_KERNEL(
     fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def)
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
     .Finalize();
diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc
index 4c9c8c47e4306c92486dd1b847884200959453dd..85793dffee9e4717e257ad8c73258ce35ad61d54 100644
--- a/lite/kernels/opencl/fc_buffer_compute_test.cc
+++ b/lite/kernels/opencl/fc_buffer_compute_test.cc
@@ -126,9 +126,11 @@ TEST(fc, compute) {
   out.Resize(out_dim);
   out_ref.Resize(out_dim);
 
+  VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim;
+
   auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-  auto* w_data = w.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-  auto* bias_data = bias.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  auto* w_data = w.mutable_data<float>();
+  auto* bias_data = bias.mutable_data<float>();
   auto* out_data = out.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
 
   std::default_random_engine engine;
@@ -148,17 +150,15 @@ TEST(fc, compute) {
   }
   for (size_t i = 0; i < w_dim.production(); ++i) {
     w_source[i] = static_cast<float>(dist(engine));
+    w_data[i] = w_source[i];
   }
   for (size_t i = 0; i < bias_dim.production(); ++i) {
     bias_source[i] = 10;  // static_cast<float>(dist(engine));
+    bias_data[i] = 10;
   }
 
   TargetWrapperCL::MemcpySync(
       x_data, x_source.data(), x_size, IoDirection::HtoD);
-  TargetWrapperCL::MemcpySync(
-      w_data, w_source.data(), w_size, IoDirection::HtoD);
-  TargetWrapperCL::MemcpySync(
-      bias_data, bias_source.data(), bias_size, IoDirection::HtoD);
 
   // run opencl kernel
   kernel->Launch();
@@ -186,8 +186,10 @@ TEST(fc, compute) {
 #endif
 
   std::vector<float> out_data_from_gpu(out_dim.production());
-  TargetWrapperCL::MemcpySync(
-      out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH);
+  TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
+                              out_data,
+                              out_data_from_gpu.size() * sizeof(float),
+                              IoDirection::DtoH);
 
   // run cpu ref
   auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));