Commit 9ef59f74 authored by zp7, committed by Yanzhan Yang

[test=develop] 1. Fix crash when GPU ops scale & elementwise_add input dim… (#1856)

* [test=develop] 1. Fix crash when the GPU scale and elementwise_add ops receive an input whose dim size equals 2.
  2. Add GPU op mul.

* Fix code style. test=develop
Parent 32065859
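
Background for the crash this commit fixes: several GPU_CL kernels read the tensor width as dims()[3], which only exists for 4-D shapes; feed them a 2-D input (for example the output of a fully connected layer) and the index is out of range. A minimal sketch of the old and new access patterns, with illustrative names (DimVec, WidthUnsafe, WidthOr1, LastDim are not repo code):

// Illustrative sketch only -- not part of the commit.
#include <cassert>
#include <cstdint>
#include <vector>

using DimVec = std::vector<int64_t>;  // stand-in for framework::DDim

// Old pattern: silently assumes a 4-D NCHW shape.
int64_t WidthUnsafe(const DimVec &dims) {
  return dims[3];  // out-of-range read for a 2-D tensor -> crash or garbage
}

// Patterns adopted in this commit:
int64_t WidthOr1(const DimVec &dims) {
  return dims.size() == 4 ? dims[3] : 1;  // scale_kernel fix below
}
int64_t LastDim(const DimVec &dims) {
  assert(!dims.empty());
  return dims[dims.size() - 1];           // elementwise_add fix below
}
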
@@ -41,10 +41,11 @@ bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
#endif
if (config.memory_pack.from_memory) {
DLOG << "load from memory!";
paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
config.memory_pack.model_buf,
config.memory_pack.combined_params_size,
config.memory_pack.combined_params_buf);
paddle_mobile_->LoadCombinedMemory(
config.memory_pack.model_size, config.memory_pack.model_buf,
config.memory_pack.combined_params_size,
config.memory_pack.combined_params_buf, config.optimize,
config.quantification, config.batch_size, config.lod_mode);
} else if (!config.model_dir.empty()) {
paddle_mobile_->Load(config.model_dir, config.optimize,
config.quantification, config.batch_size,
......
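
The Init() change above forwards optimize, quantification, batch_size and lod_mode to LoadCombinedMemory, so the load-from-memory path now honours the same options as the load-from-directory path. A hedged caller sketch; the field names come from the diff, but the header path, namespace and buffer types are assumptions:

#include <cstdint>
#include "io/paddle_inference_api.h"  // assumed location of PaddleMobileConfig

// Builds a config for the from_memory path; the four flags at the bottom are
// the ones that now reach LoadCombinedMemory as well.
paddle_mobile::PaddleMobileConfig MakeMemoryConfig(uint8_t *model_buf,
                                                   size_t model_size,
                                                   uint8_t *params_buf,
                                                   size_t params_size) {
  paddle_mobile::PaddleMobileConfig config;
  config.memory_pack.from_memory = true;
  config.memory_pack.model_buf = model_buf;
  config.memory_pack.model_size = model_size;
  config.memory_pack.combined_params_buf = params_buf;
  config.memory_pack.combined_params_size = params_size;
  config.optimize = true;
  config.quantification = false;
  config.batch_size = 1;
  config.lod_mode = false;
  return config;
}
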
@@ -28,3 +28,24 @@ __kernel void channel_add(__global image2d_t input, __global image2d_t bias,__wr
half4 output = in + biase;
write_imageh(outputImage,coords,output);
}
__kernel void width_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t
outputImage,int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords_bias);
half4 output;
output.x = in.x + biase.x;
output.y = in.y + biase.x;
output.z = in.z + biase.x;
output.w = in.w + biase.x;
write_imageh(outputImage,coords,output);
}
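
Read on the tensor level, the new width_add kernel broadcasts a length-W bias along the last dimension: the image x coordinate folds back onto the tensor width via x % w, and that single bias value is added to all four packed channel components. A CPU reference sketch of the same broadcast (illustrative names, NCHW layout assumed):

// out[n][c][h][j] = in[n][c][h][j] + bias[j], where bias has length w.
void WidthAddReference(const float *in, const float *bias, float *out,
                       int n, int c, int h, int w) {
  for (int row = 0; row < n * c * h; ++row) {
    for (int j = 0; j < w; ++j) {
      out[row * w + j] = in[row * w + j] + bias[j];
    }
  }
}
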
@@ -23,21 +23,27 @@ template <>
bool ElementwiseAddKernel<GPU_CL, float>::Init(
ElementwiseAddParam<GPU_CL> *param) {
DLOG << "-----init add-----";
CLImage *bias = (CLImage *)(param->InputY());
CLImage *bias = reinterpret_cast<CLImage *>(const_cast<CLImage *>
(param->InputY()));
if (!bias->isInit()) {
bias->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
bias->InitNormalCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
}
DLOG << " bias: " << *bias;
if (bias->dims().size() == 4) {
this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
} else if (param->InputY()->dims().size() == 1) {
this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
if (param->Axis() == param->InputX()->dims().size() - 1) {
this->cl_helper_.AddKernel("width_add", "channel_add_kernel.cl");
} else if (param->Axis() == param->InputX()->dims().size() - 3) {
this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
} else {
DLOG << "error:bias dims is error";
}
} else {
DLOG << "error:bias dims is error";
}
return true;
}
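
With a 1-D bias, Init() now picks the kernel from the broadcast axis instead of always assuming a per-channel add; the last-axis branch is what makes the previously crashing 2-D inputs work. The selection rule restated as a small sketch (the helper name is illustrative):

// axis = param->Axis(), rank = InputX()->dims().size(), bias is 1-D.
inline const char *PickAddKernel(int axis, int rank) {
  if (axis == rank - 1) return "width_add";    // bias of length W, broadcast over N, C, H
  if (axis == rank - 3) return "channel_add";  // bias of length C, broadcast over N, H, W
  return nullptr;                              // unsupported; the op only logs an error
}
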
@@ -70,31 +76,37 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 1) {
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[3];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
int width = input->ImageWidth();
int height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
cl_event out_event = param.Out()->GetClEvent();
cl_event wait_event = param.InputX()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
if (param.Axis() == param.InputX()->dims().size() - 1 ||
param.Axis() == param.InputX()->dims().size() - 3) {
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[input->dims().size() - 1];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
int width = input->ImageWidth();
int height = input->ImageHeight();
DLOG << "dede:" << width << "," << height;
size_t global_work_size[2] = {width, height};
cl_event out_event = param.Out()->GetClEvent();
cl_event wait_event = param.InputX()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
DLOG << "error:bias dims is error";
}
} else {
DLOG << "error:bias dims is error";
}
......
@@ -21,11 +21,64 @@ namespace operators {
template <>
bool MulKernel<GPU_CL, float>::Init(MulParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <typename Dtype>
void MulCompute(const MulParam<GPU_CL> &param, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel0,
cl_kernel kernel1) {
auto input_x = param.InputX();
Tensor *input_x_tensor = new Tensor();
input_x_tensor->Resize(input_x->dims());
input_x_tensor->mutable_data<float>();
framework::CLImageToTensor(input_x, input_x_tensor, context, commandQueue,
kernel0);
auto input_y = param.InputY();
Tensor input_y_tensor(input_y->data<float>(), input_y->dims());
const Tensor x_matrix =
input_x_tensor->dims().size() > 2
? framework::ReshapeToMatrix(*input_x_tensor, param.XNumColDims())
: *input_x_tensor;
const Tensor y_matrix =
input_y_tensor.dims().size() > 2
? framework::ReshapeToMatrix(input_y_tensor, param.YNumColDims())
: input_y_tensor;
auto out_dim = param.Out()->dims();
if (out_dim.size() != 2) {
param.Out()->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto output = param.Out();
Tensor *output_tensor = new Tensor();
output_tensor->Resize(output->dims());
output_tensor->mutable_data<float>();
math::MatMul<float, float>(x_matrix, false, y_matrix, false,
static_cast<float>(1), output_tensor,
static_cast<float>(0));
output->InitEmptyImage(context, commandQueue, output_tensor->dims());
framework::TensorToCLImage(output_tensor, output, context, commandQueue,
kernel1);
delete (input_x_tensor);
delete (output_tensor);
}
template <>
void MulKernel<GPU_CL, float>::Compute(const MulParam<GPU_CL> &param) {}
void MulKernel<GPU_CL, float>::Compute(const MulParam<GPU_CL> &param) {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
MulCompute<float>(param, this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue(), kernel0, kernel1);
}
template class MulKernel<GPU_CL, float>;
......
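
The new GPU mul kernel is a correctness fallback rather than a GPU GEMM: it copies the X image back into a CPU tensor, flattens X and Y to matrices, runs math::MatMul on the CPU, and writes the result into a freshly initialised CL image. A sketch of the shape arithmetic, assuming ReshapeToMatrix flattens the leading num_col_dims dimensions into rows and the remaining ones into columns, as in Paddle's mul op (helper names are illustrative):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

struct MatShape { int64_t rows, cols; };

MatShape FlattenToMatrix(const std::vector<int64_t> &dims, int num_col_dims) {
  auto prod = [](auto first, auto last) {
    return std::accumulate(first, last, int64_t{1}, std::multiplies<int64_t>());
  };
  return {prod(dims.begin(), dims.begin() + num_col_dims),
          prod(dims.begin() + num_col_dims, dims.end())};
}

// Example: X = [2, 3, 4] with x_num_col_dims = 1 -> 2 x 12
//          Y = [12, 5]   with y_num_col_dims = 1 -> 12 x 5
//          Out is resized to {x.rows, y.cols} = [2, 5], matching the Resize above.

Routing through the CPU also explains why Init() only registers the fetch and feed kernels: they serve the image-to-tensor and tensor-to-image copies, not the multiplication itself.
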
@@ -35,7 +35,7 @@ void ScaleKernel<GPU_CL, float>::Compute(const ScaleParam<GPU_CL>& param) {
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
int out_width = output->dims()[3];
int out_width = (output->dims().size() == 4) ? output->dims()[3] : 1;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
clSetKernelArg(kernel, 2, sizeof(float), &scale);
......
@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include <memory>
#include "common/log.h"
#include "common/type_define.h"
#include "common/types.h"
@@ -654,9 +655,9 @@ class MulParam : public OpParam {
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
}
const GType *InputX() const { return input_x_; }
GType *InputX() const { return input_x_; }
const GType *InputY() const { return input_y_; }
GType *InputY() const { return input_y_; }
GType *Out() const { return out_; }
......
@@ -174,6 +174,9 @@ void test(int argc, char *argv[]) {
#ifdef PADDLE_MOBILE_CL
for (auto var_name : var_names) {
auto cl_image = paddle_mobile.FetchImage(var_name);
if (cl_image == nullptr || cl_image->GetCLImage() == nullptr) {
continue;
}
auto len = cl_image->numel();
if (len == 0) {
continue;
......