提交 2fa2871d 编写于 作者: L liuruilong

fix build kernel error

上级 edd2c677
......@@ -488,7 +488,7 @@ static const uint8_t shifttable[512] = {
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
// Converts a 32-bit float to its half_t representation using lookup tables.
//
// The bit pattern of `f` is read as a uint32_t. Bits 23..31 (nine bits; the
// sign and exponent fields assuming an IEEE-754 binary32 layout) select an
// entry in both `basetable` and `shifttable`. The low 23 bits are shifted
// right by the amount taken from `shifttable` and added to the `basetable`
// entry to form the result.
//
// NOTE(review): reading the float through a uint32_t pointer relies on the
// compiler tolerating this type pun; a memcpy-based copy would avoid the
// aliasing question but needs <cstring>, which is not visible from here.
half_t float2half(float f) {
  uint32_t v = *reinterpret_cast<uint32_t *>(&f);
  // Same 9-bit index is used for both tables.
  const uint32_t index = (v >> 23) & 0x1ff;
  return basetable[index] + ((v & 0x007fffff) >> shifttable[index]);
}
......@@ -496,7 +496,7 @@ half_t float2half(float f) {
// Converts a half_t value to a 32-bit float using lookup tables.
//
// The upper bits of `h` (h >> 10) select an entry in `offsettable` and in
// `exponenttable`; the low 10 bits (h & 0x3ff) are added to the offset to
// index `mantissatable`. The two table values are summed into a 32-bit
// pattern that is then read back as a float.
//
// NOTE(review): reading the uint32_t through a float pointer relies on the
// compiler tolerating this type pun.
float half2float(half_t h) {
  uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] +
               exponenttable[h >> 10];
  return *reinterpret_cast<float *>(&v);
}
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
......
......@@ -101,7 +101,8 @@ class CLImage {
// Returns the host-side tensor buffer viewed as a pointer to T.
//
// Raises via PADDLE_MOBILE_THROW_EXCEPTION when `initialized_` is set; the
// message states that the tensor data has been deleted once the CL image has
// been initialized. Otherwise `tensor_data_` is reinterpreted as `T *`.
T *data() const {
  if (initialized_) {
    PADDLE_MOBILE_THROW_EXCEPTION(
        " cl image has initialized, tensor data has been deleted, can't use "
        "tensor data");
  }
  return reinterpret_cast<T *>(tensor_data_);
}
......@@ -194,8 +195,9 @@ class CLImage {
DLOG << " image width: " << width;
DLOG << " image height: " << height;
cl_image_ = clCreateImage2D(
context, // cl_context context
CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), // cl_mem_flags flags
context, // cl_context context
CL_MEM_READ_WRITE |
(imageData ? CL_MEM_COPY_HOST_PTR : 0), // cl_mem_flags flags
&cf, // const cl_image_format *image_format
width, // size_t image_width
height, // size_t image_height
......@@ -223,9 +225,11 @@ class CLImage {
cl_context context_;
};
// Declaration only; the definition is not visible here.
// Presumably transfers the contents of `tensor` into `image` using
// `commandQueue` -- confirm against the definition.
void TensorToCLImage(Tensor *tensor, CLImage *image,
                     cl_command_queue commandQueue);

// Declaration only; the definition is not visible here.
// Presumably the inverse transfer, from `image` into `tensor`, using
// `commandQueue` -- confirm against the definition.
void CLImageToTensor(CLImage *image, Tensor *tensor,
                     cl_command_queue commandQueue);
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &image);
......
......@@ -42,7 +42,8 @@ class CLScope {
auto program = Program(file_name);
DLOG << " get program ~ ";
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
clCreateKernel(program, kernel_name.c_str(), NULL));
clCreateKernel(program, kernel_name.c_str(), &status_));
CL_CHECK_ERRORS(status_);
DLOG << " create kernel ~ ";
return std::move(kernel);
}
......@@ -60,11 +61,12 @@ class CLScope {
status_ =
clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
CL_CHECK_ERRORS(status_);
programs_[file_name] = std::move(program);
return program.get();
return programs_[file_name].get();
}
private:
......
......@@ -656,7 +656,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
if (output_tensor != nullptr) {
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
......
......@@ -60,7 +60,7 @@ void OperatorBase<Dtype>::Run() {
DLOG << " begin run " << type_;
RunImpl();
DLOG << " end run " << type_;
return;
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
......@@ -103,7 +103,7 @@ void OperatorBase<Dtype>::Run() {
#ifdef PADDLE_MOBILE_CL
if (type_ == "fetch") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor){
if (tensor) {
DLOG << type_ << " output- " << key << "=" << tensor->dims();
}
} else {
......
......@@ -25,9 +25,9 @@ template <>
bool ConvAddBNReluKernel<GPU_CL, float>::Init(
FusionConvAddBNReluParam<GPU_CL> *param) {
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext());
param->Bias()->InitCLImage(cl_helper_.CLContext());
......
......@@ -40,18 +40,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
DLOG << " here1 ";
this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
DLOG << " here2 ";
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
DLOG << " here3 ";
this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
......@@ -64,64 +61,64 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
// Binds the convolution arguments to the first kernel registered on
// cl_helper_ and enqueues it on the helper's command queue.
//
// The 3-D global work size comes from cl_helper_.DefaultWorkSize() for the
// output and its three components are also passed to the kernel as the first
// three integer arguments. The remaining arguments are the input and filter
// CL images, the output, and the stride / offset / input channel block /
// dilation / per-block input width and height read from `param`.
//
// Every OpenCL call result is passed to CL_CHECK_ERRORS, so a failure while
// setting any individual argument is reported instead of being overwritten
// by the result of the next call.
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  DLOG << " Compute helper: " << &cl_helper_;
  DLOG << " begin compute ";
  auto kernel = this->cl_helper_.KernelAt(0);
  DLOG << " get work size ";
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
  DLOG << " end work size ";
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  auto input = param.Input()->GetCLImage();
  DLOG << " get Input ";
  auto filter = param.Filter()->GetCLImage();
  DLOG << " get Filter ";
  // NOTE(review): input and filter are obtained through GetCLImage(), but
  // output is whatever param.Output() returns and is still passed below with
  // sizeof(cl_mem). Confirm whether this should be
  // param.Output()->GetCLImage().
  auto output = param.Output();
  DLOG << " get Output ";
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();

  cl_int status;

  // Sets one kernel argument and checks the result immediately.
  auto set_arg = [&](unsigned int index, size_t size, const void *value) {
    status = clSetKernelArg(kernel, index, size, value);
    CL_CHECK_ERRORS(status);
  };

  DLOG << " begin set kernel arg ";
  set_arg(0, sizeof(int), &c_block);
  set_arg(1, sizeof(int), &w);
  set_arg(2, sizeof(int), &nh);
  set_arg(3, sizeof(cl_mem), &input);
  set_arg(4, sizeof(cl_mem), &filter);
  set_arg(5, sizeof(cl_mem), &output);
  set_arg(6, sizeof(int), &stride);
  set_arg(7, sizeof(int), &offset);
  set_arg(8, sizeof(int), &input_c);
  set_arg(9, sizeof(int), &dilation);
  set_arg(10, sizeof(int), &input_width);
  set_arg(11, sizeof(int), &input_height);
  DLOG << " end set kernel arg ";

  DLOG << " begin enqueue ";
  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);
  DLOG << " end enqueue ";
  CL_CHECK_ERRORS(status);
}
// Explicit instantiation of ConvKernel for the GPU_CL device tag with float.
template class ConvKernel<GPU_CL, float>;
......
......@@ -34,23 +34,24 @@ int main() {
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
// std::vector<float>::iterator biggest =
// std::max_element(std::begin(vec_result), std::end(vec_result));
// std::cout << " Max element is " << *biggest << " at position "
// << std::distance(std::begin(vec_result), biggest) << std::endl;
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// DLOG << vec_result;
// auto time4 = paddle_mobile::time();
// std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 << "ms"
// << std::endl;
// std::vector<float>::iterator biggest =
// std::max_element(std::begin(vec_result), std::end(vec_result));
// std::cout << " Max element is " << *biggest << " at position "
// << std::distance(std::begin(vec_result), biggest) <<
// std::endl;
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// DLOG << vec_result;
// auto time4 = paddle_mobile::time();
// std::cout << "predict cost :" << paddle_mobile::time_diff(time3,
// time4) / 10 << "ms"
// << std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册