diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
index a14748c69f3eafce515c90f2b8a226703fe5883d..080ce2b457421970409431dee6841ac4f7d57bb5 100644
--- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl
@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
   } else {
     for (int cidx = col; cidx < N; ++cidx) {
       for (int ridx = row; ridx < M; ++ridx) {
-        CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
+        CL_COMPUTE_DTYPE a0 = 0;
+        CL_COMPUTE_DTYPE b0 = 0;
+        CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
         for (int p = 0; p < K; ++p) {
           a0 = *(a + ridx * K + p);
           b0 = *(b + p * N + cidx),
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index daf732f4b6ec23d513b91fb5a2982be2066eb625..5b9e3b220a4134d466d6a4b5cbdfa2f42d12bdf6 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -28,7 +28,7 @@ namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
-/* image kernel*/
+
 void ConvImageCompute::PrepareForRun() {
   const auto& param = this->Param<param_t>();
   auto x_dims = param.x->dims();
diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc
index 9763faf2f33f578e6f62b07a8c89390e1b80c159..3a31c8993d77388b95260ad5c0be65f791c433eb 100644
--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -35,10 +35,27 @@ class FcCompute
  public:
   using param_t = operators::FcParam;
 
-  void PrepareForRun() override {}
+  void PrepareForRun() override {
+    fc_param_ = param_.get_mutable<param_t>();
+    auto w_t = fc_param_->w;
+    auto bias_t = fc_param_->bias;
+
+    w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto w_gpu_data =
+        w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size());
+    TargetWrapperCL::MemcpySync(
+        w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);
+
+    bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto b_gpu_data =
+        bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size());
+    TargetWrapperCL::MemcpySync(b_gpu_data,
+                                bias_t->raw_data(),
+                                bias_t->memory_size(),
+                                IoDirection::HtoD);
+  }
 
   void ReInitWhenNeeded() override {
-    fc_param_ = param_.get_mutable<param_t>();
     const auto x_dims = fc_param_->input->dims();
     if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
         first_epoch_for_reinit_) {
@@ -93,7 +110,7 @@ class FcCompute
   }
 
   void GetGlobalWorkSize() {
-    if (m_ == 1) {  // gemv
+    if (kernel_func_name_ == "fc_gemv_1x4") {  // gemv
       global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
     } else {  // gemm
       global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
@@ -103,8 +120,8 @@ class FcCompute
 
   void Run() override {
     auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
-    auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
-    auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
+    auto* w_buf = w_gpu_t_->data<float, cl::Buffer>();
+    auto* bias_buf = bias_gpu_t_->data<float, cl::Buffer>();
     auto* out_buf = fc_param_->output->mutable_data<float, cl::Buffer>(
         TARGET(kOpenCL));
 
@@ -154,6 +171,10 @@ class FcCompute
   std::string time_stamp_{GetTimeStamp()};
   bool first_epoch_for_reinit_{true};
   DDim last_x_dims_;
+
+  std::unique_ptr<Tensor> w_gpu_t_{nullptr};
+  std::unique_ptr<Tensor> bias_gpu_t_{nullptr};
+
   cl::NDRange global_work_size_;
   cl::Kernel kernel_;
 };
@@ -166,7 +187,7 @@ class FcCompute
 REGISTER_LITE_KERNEL(
     fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def)
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
     .Finalize();
diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc
index 4c9c8c47e4306c92486dd1b847884200959453dd..85793dffee9e4717e257ad8c73258ce35ad61d54 100644
--- a/lite/kernels/opencl/fc_buffer_compute_test.cc
+++ b/lite/kernels/opencl/fc_buffer_compute_test.cc
@@ -126,9 +126,11 @@ TEST(fc, compute) {
   out.Resize(out_dim);
   out_ref.Resize(out_dim);
 
+  VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim;
+
   auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-  auto* w_data = w.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-  auto* bias_data = bias.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  auto* w_data = w.mutable_data<float>();
+  auto* bias_data = bias.mutable_data<float>();
   auto* out_data = out.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
 
   std::default_random_engine engine;
@@ -148,17 +150,15 @@ TEST(fc, compute) {
   }
   for (size_t i = 0; i < w_dim.production(); ++i) {
     w_source[i] = static_cast<float>(dist(engine));
+    w_data[i] = w_source[i];
   }
   for (size_t i = 0; i < bias_dim.production(); ++i) {
     bias_source[i] = 10;  // static_cast<float>(dist(engine));
+    bias_data[i] = 10;
   }
 
   TargetWrapperCL::MemcpySync(
       x_data, x_source.data(), x_size, IoDirection::HtoD);
-  TargetWrapperCL::MemcpySync(
-      w_data, w_source.data(), w_size, IoDirection::HtoD);
-  TargetWrapperCL::MemcpySync(
-      bias_data, bias_source.data(), bias_size, IoDirection::HtoD);
 
   // run opencl kernel
   kernel->Launch();
@@ -186,8 +186,10 @@ TEST(fc, compute) {
 #endif
 
   std::vector<float> out_data_from_gpu(out_dim.production());
-  TargetWrapperCL::MemcpySync(
-      out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH);
+  TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
+                              out_data,
+                              out_data_from_gpu.size() * sizeof(float),
+                              IoDirection::DtoH);
 
   // run cpu ref
   auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));