Unverified commit 08d9efbd, authored by ysh329, committed by GitHub

[OPENCL] Fix opencl fc int16 model bug caused by fc kernel (#3900) (#3914)

* fix opencl fc kernel that caused abnormal weights in int16 models. test=develop
Parent 62baa5fb
@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
   } else {
     for (int cidx = col; cidx < N; ++cidx) {
       for (int ridx = row; ridx < M; ++ridx) {
-        CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
+        CL_COMPUTE_DTYPE a0 = 0;
+        CL_COMPUTE_DTYPE b0 = 0;
+        CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
         for (int p = 0; p < K; ++p) {
           a0 = *(a + ridx * K + p);
           b0 = *(b + p * N + cidx),
...
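A note on the kernel fix above: in C and OpenCL C, an initializer in a declaration list binds only to the declarator it is attached to, so the removed line initialized c0 but left a0 and b0 indeterminate. A minimal C++ illustration of that semantics (variable names are hypothetical, chosen for the example):

    int main() {
      // Only c0 receives the initializer; a0 and b0 start indeterminate,
      // and reading them before assignment is undefined behavior.
      float a0, b0, c0 = 1.0f;
      (void)a0;
      (void)b0;
      (void)c0;

      // The patched kernel instead gives each variable its own declaration
      // with an explicit initializer:
      float d0 = 0.0f;
      float e0 = 0.0f;
      float f0 = 1.0f;
      return static_cast<int>(d0 + e0 + f0);
    }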
@@ -26,7 +26,7 @@ namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
-/* image kernel*/
+
 void ConvImageCompute::PrepareForRun() {
   const auto& param = this->Param<param_t>();
   auto x_dims = param.x->dims();
...
@@ -31,10 +31,27 @@ class FcCompute
  public:
   using param_t = operators::FcParam;

-  void PrepareForRun() override {}
+  void PrepareForRun() override {
+    fc_param_ = param_.get_mutable<param_t>();
+    auto w_t = fc_param_->w;
+    auto bias_t = fc_param_->bias;
+
+    w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto w_gpu_data =
+        w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size());
+    TargetWrapperCL::MemcpySync(
+        w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);
+
+    bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
+    auto b_gpu_data =
+        bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size());
+    TargetWrapperCL::MemcpySync(b_gpu_data,
+                                bias_t->raw_data(),
+                                bias_t->memory_size(),
+                                IoDirection::HtoD);
+  }

   void ReInitWhenNeeded() override {
-    fc_param_ = param_.get_mutable<param_t>();
     const auto x_dims = fc_param_->input->dims();
     if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
         first_epoch_for_reinit_) {
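The new PrepareForRun, condensed: one host-to-device copy per weight tensor, paid once at kernel setup. The snippet below restates the pattern using only calls visible in this diff (Paddle-Lite internals, so it is a sketch rather than a standalone program; w_gpu_t and w_t mirror the members above):

    // One-time HtoD copy at prepare time; Run() reuses the device buffer.
    std::unique_ptr<Tensor> w_gpu_t(new Tensor);
    auto w_gpu_data =
        w_gpu_t->mutable_data(TARGET(kOpenCL), w_t->memory_size());
    TargetWrapperCL::MemcpySync(
        w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);
    // Every subsequent Run() reads w_gpu_t->data<float, cl::Buffer>()
    // instead of fc_param_->w, so host memory is touched exactly once.

Doing the copy in PrepareForRun, which executes once, rather than in Run keeps the per-inference cost unchanged.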
@@ -89,7 +106,7 @@ class FcCompute
   }

   void GetGlobalWorkSize() {
-    if (m_ == 1) {  // gemv
+    if (kernel_func_name_ == "fc_gemv_1x4") {  // gemv
       global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
     } else {  // gemm
       global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
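A plausible reading of this hunk (the kernel-selection code sits outside the visible context): keying the global work size off kernel_func_name_, i.e. the kernel that was actually chosen, instead of re-deriving the choice from m_ == 1, means the dispatch condition and the work-size computation can no longer drift apart.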
@@ -99,8 +116,8 @@ class FcCompute
   void Run() override {
     auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
-    auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
-    auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
+    auto* w_buf = w_gpu_t_->data<float, cl::Buffer>();
+    auto* bias_buf = bias_gpu_t_->data<float, cl::Buffer>();
     auto* out_buf =
         fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
@@ -142,6 +159,10 @@ class FcCompute
   std::string time_stamp_{GetTimeStamp()};
   bool first_epoch_for_reinit_{true};
   DDim last_x_dims_;
+
+  std::unique_ptr<Tensor> w_gpu_t_{nullptr};
+  std::unique_ptr<Tensor> bias_gpu_t_{nullptr};
+
   cl::NDRange global_work_size_;
   cl::Kernel kernel_;
 };
@@ -154,7 +175,7 @@ class FcCompute
 REGISTER_LITE_KERNEL(
     fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def)
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
     .Finalize();
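Design note, inferred from this diff rather than stated in the commit message: rebinding "W" and "Bias" to TARGET(kARM) tells the framework to hand the kernel the original host-side weight tensors instead of automatically transferred device copies. The kernel then owns the single HtoD transfer itself (the w_gpu_t_ / bias_gpu_t_ members above), which appears to be what prevents the abnormal weights seen with int16 models.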
@@ -126,9 +126,11 @@ TEST(fc, compute) {
   out.Resize(out_dim);
   out_ref.Resize(out_dim);

+  VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim;
+
   auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-  auto* w_data = w.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-  auto* bias_data = bias.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+  auto* w_data = w.mutable_data<float>();
+  auto* bias_data = bias.mutable_data<float>();
   auto* out_data = out.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

   std::default_random_engine engine;
@@ -148,17 +150,15 @@ TEST(fc, compute) {
   }
   for (size_t i = 0; i < w_dim.production(); ++i) {
     w_source[i] = static_cast<int>(dist(engine));
+    w_data[i] = w_source[i];
   }
   for (size_t i = 0; i < bias_dim.production(); ++i) {
     bias_source[i] = 10;  // static_cast<int>(dist(engine));
+    bias_data[i] = 10;
   }

   TargetWrapperCL::MemcpySync(
       x_data, x_source.data(), x_size, IoDirection::HtoD);
-  TargetWrapperCL::MemcpySync(
-      w_data, w_source.data(), w_size, IoDirection::HtoD);
-  TargetWrapperCL::MemcpySync(
-      bias_data, bias_source.data(), bias_size, IoDirection::HtoD);

   // run opencl kernel
   kernel->Launch();
@@ -186,8 +186,10 @@ TEST(fc, compute) {
 #endif

   std::vector<float> out_data_from_gpu(out_dim.production());
-  TargetWrapperCL::MemcpySync(
-      out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH);
+  TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
+                              out_data,
+                              out_data_from_gpu.size() * sizeof(float),
+                              IoDirection::DtoH);

   // run cpu ref
   auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
...
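The last hunk also repairs a test bug worth calling out: the device-to-host read-back sized the transfer with bias_size, the byte count of an unrelated tensor, instead of the output's own byte count. A self-contained C++ sketch of the bug and fix patterns, with std::memcpy standing in for TargetWrapperCL::MemcpySync and all sizes hypothetical:

    #include <cstring>
    #include <vector>

    int main() {
      std::vector<float> device_result(12, 3.0f);  // stands in for the GPU buffer
      std::vector<float> host_copy(12, 0.0f);
      std::size_t bias_size = 4 * sizeof(float);   // byte size of some other tensor

      // Bug pattern: sizing the copy from an unrelated tensor reads back
      // only 4 of the 12 output floats.
      std::memcpy(host_copy.data(), device_result.data(), bias_size);

      // Fix pattern: derive the byte count from the destination itself.
      std::memcpy(host_copy.data(), device_result.data(),
                  host_copy.size() * sizeof(float));
      return 0;
    }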