未验证 提交 08d9efbd 编写于 作者: Y ysh329 提交者: GitHub

[OPENCL] Fix opencl fc int16 model bug caused by fc kernel (#3900) (#3914)

* Fix the OpenCL fc kernel bug that caused abnormal weights in int16 models. test=develop
上级 62baa5fb
......@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
} else {
for (int cidx = col; cidx < N; ++cidx) {
for (int ridx = row; ridx < M; ++ridx) {
CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
CL_COMPUTE_DTYPE a0 = 0;
CL_COMPUTE_DTYPE b0 = 0;
CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
for (int p = 0; p < K; ++p) {
a0 = *(a + ridx * K + p);
b0 = *(b + p * N + cidx),
......
......@@ -26,7 +26,7 @@ namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
/* image kernel*/
void ConvImageCompute::PrepareForRun() {
const auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
......
......@@ -31,10 +31,27 @@ class FcCompute
public:
using param_t = operators::FcParam;
void PrepareForRun() override {}
// One-time setup: copies the fc weight and bias tensors from fc_param_ into
// kernel-owned tensors (w_gpu_t_ / bias_gpu_t_) allocated for TARGET(kOpenCL).
void PrepareForRun() override {
fc_param_ = param_.get_mutable<param_t>();
auto w_t = fc_param_->w;
auto bias_t = fc_param_->bias;
// Weight: allocate memory_size() bytes on the OpenCL target, then copy the
// raw bytes of w_t into it with direction HtoD.
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto w_gpu_data =
w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size());
TargetWrapperCL::MemcpySync(
w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);
// Bias: same allocate-and-copy sequence as the weight.
// NOTE(review): bias_t is dereferenced unconditionally here; if the fc bias
// can be absent (null) this would crash — confirm against FcParam.
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto b_gpu_data =
bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size());
TargetWrapperCL::MemcpySync(b_gpu_data,
bias_t->raw_data(),
bias_t->memory_size(),
IoDirection::HtoD);
}
void ReInitWhenNeeded() override {
fc_param_ = param_.get_mutable<param_t>();
const auto x_dims = fc_param_->input->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
......@@ -89,7 +106,7 @@ class FcCompute
}
void GetGlobalWorkSize() {
if (m_ == 1) { // gemv
if (kernel_func_name_ == "fc_gemv_1x4") { // gemv
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
......@@ -99,8 +116,8 @@ class FcCompute
void Run() override {
auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
auto* w_buf = w_gpu_t_->data<float, cl::Buffer>();
auto* bias_buf = bias_gpu_t_->data<float, cl::Buffer>();
auto* out_buf =
fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
......@@ -142,6 +159,10 @@ class FcCompute
std::string time_stamp_{GetTimeStamp()};
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
std::unique_ptr<Tensor> w_gpu_t_{nullptr};
std::unique_ptr<Tensor> bias_gpu_t_{nullptr};
cl::NDRange global_work_size_;
cl::Kernel kernel_;
};
......@@ -154,7 +175,7 @@ class FcCompute
REGISTER_LITE_KERNEL(
fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
......@@ -126,9 +126,11 @@ TEST(fc, compute) {
out.Resize(out_dim);
out_ref.Resize(out_dim);
VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim;
auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto* w_data = w.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto* bias_data = bias.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto* w_data = w.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* out_data = out.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
......@@ -148,17 +150,15 @@ TEST(fc, compute) {
}
for (size_t i = 0; i < w_dim.production(); ++i) {
w_source[i] = static_cast<int>(dist(engine));
w_data[i] = w_source[i];
}
for (size_t i = 0; i < bias_dim.production(); ++i) {
bias_source[i] = 10; // static_cast<int>(dist(engine));
bias_data[i] = 10;
}
TargetWrapperCL::MemcpySync(
x_data, x_source.data(), x_size, IoDirection::HtoD);
TargetWrapperCL::MemcpySync(
w_data, w_source.data(), w_size, IoDirection::HtoD);
TargetWrapperCL::MemcpySync(
bias_data, bias_source.data(), bias_size, IoDirection::HtoD);
// run opencl kernel
kernel->Launch();
......@@ -186,8 +186,10 @@ TEST(fc, compute) {
#endif
std::vector<float> out_data_from_gpu(out_dim.production());
TargetWrapperCL::MemcpySync(
out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH);
TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
out_data,
out_data_from_gpu.size() * sizeof(float),
IoDirection::DtoH);
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册