提交 2fa2871d 编写于 作者: L liuruilong

fix build kernel error

上级 edd2c677
...@@ -488,7 +488,7 @@ static const uint8_t shifttable[512] = { ...@@ -488,7 +488,7 @@ static const uint8_t shifttable[512] = {
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
half_t float2half(float f) { half_t float2half(float f) {
uint32_t v = *reinterpret_cast<uint32_t*>(&f); uint32_t v = *reinterpret_cast<uint32_t *>(&f);
return basetable[(v >> 23) & 0x1ff] + return basetable[(v >> 23) & 0x1ff] +
((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]);
} }
...@@ -496,7 +496,7 @@ half_t float2half(float f) { ...@@ -496,7 +496,7 @@ half_t float2half(float f) {
float half2float(half_t h) { float half2float(half_t h) {
uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] +
exponenttable[h >> 10]; exponenttable[h >> 10];
return *reinterpret_cast<float*>(&v); return *reinterpret_cast<float *>(&v);
} }
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
......
...@@ -101,7 +101,8 @@ class CLImage { ...@@ -101,7 +101,8 @@ class CLImage {
T *data() const { T *data() const {
if (initialized_) { if (initialized_) {
PADDLE_MOBILE_THROW_EXCEPTION( PADDLE_MOBILE_THROW_EXCEPTION(
" cl image has initialized, tensor data has been deleted, can't use tensor data"); " cl image has initialized, tensor data has been deleted, can't use "
"tensor data");
} }
return reinterpret_cast<T *>(tensor_data_); return reinterpret_cast<T *>(tensor_data_);
} }
...@@ -194,8 +195,9 @@ class CLImage { ...@@ -194,8 +195,9 @@ class CLImage {
DLOG << " image width: " << width; DLOG << " image width: " << width;
DLOG << " image height: " << height; DLOG << " image height: " << height;
cl_image_ = clCreateImage2D( cl_image_ = clCreateImage2D(
context, // cl_context context context, // cl_context context
CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0), // cl_mem_flags flags CL_MEM_READ_WRITE |
(imageData ? CL_MEM_COPY_HOST_PTR : 0), // cl_mem_flags flags
&cf, // const cl_image_format *image_format &cf, // const cl_image_format *image_format
width, // size_t image_width width, // size_t image_width
height, // size_t image_height height, // size_t image_height
...@@ -223,9 +225,11 @@ class CLImage { ...@@ -223,9 +225,11 @@ class CLImage {
cl_context context_; cl_context context_;
}; };
void TensorToCLImage(Tensor *tensor, CLImage *image,cl_command_queue commandQueue); void TensorToCLImage(Tensor *tensor, CLImage *image,
cl_command_queue commandQueue);
void CLImageToTensor(CLImage *image, Tensor *tensor,cl_command_queue commandQueue); void CLImageToTensor(CLImage *image, Tensor *tensor,
cl_command_queue commandQueue);
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &image); Print &operator<<(Print &printer, const CLImage &image);
......
...@@ -42,7 +42,8 @@ class CLScope { ...@@ -42,7 +42,8 @@ class CLScope {
auto program = Program(file_name); auto program = Program(file_name);
DLOG << " get program ~ "; DLOG << " get program ~ ";
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
clCreateKernel(program, kernel_name.c_str(), NULL)); clCreateKernel(program, kernel_name.c_str(), &status_));
CL_CHECK_ERRORS(status_);
DLOG << " create kernel ~ "; DLOG << " create kernel ~ ";
return std::move(kernel); return std::move(kernel);
} }
...@@ -60,11 +61,12 @@ class CLScope { ...@@ -60,11 +61,12 @@ class CLScope {
status_ = status_ =
clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0); clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
CL_CHECK_ERRORS(status_); CL_CHECK_ERRORS(status_);
programs_[file_name] = std::move(program); programs_[file_name] = std::move(program);
return program.get(); return programs_[file_name].get();
} }
private: private:
......
...@@ -656,7 +656,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict( ...@@ -656,7 +656,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0); std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
if (output_tensor != nullptr) { if (output_tensor != nullptr) {
Executor<Dtype, P>::Ptype *output_ptr = Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>(); output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector; std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) { for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]); result_vector.push_back(output_ptr[j]);
......
...@@ -60,7 +60,7 @@ void OperatorBase<Dtype>::Run() { ...@@ -60,7 +60,7 @@ void OperatorBase<Dtype>::Run() {
DLOG << " begin run " << type_; DLOG << " begin run " << type_;
RunImpl(); RunImpl();
DLOG << " end run " << type_; DLOG << " end run " << type_;
return;
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------"; DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys(); vector<string> input_keys = GetInputKeys();
...@@ -103,7 +103,7 @@ void OperatorBase<Dtype>::Run() { ...@@ -103,7 +103,7 @@ void OperatorBase<Dtype>::Run() {
#ifdef PADDLE_MOBILE_CL #ifdef PADDLE_MOBILE_CL
if (type_ == "fetch") { if (type_ == "fetch") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>(); Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor){ if (tensor) {
DLOG << type_ << " output- " << key << "=" << tensor->dims(); DLOG << type_ << " output- " << key << "=" << tensor->dims();
} }
} else { } else {
......
...@@ -25,9 +25,9 @@ template <> ...@@ -25,9 +25,9 @@ template <>
bool ConvAddBNReluKernel<GPU_CL, float>::Init( bool ConvAddBNReluKernel<GPU_CL, float>::Init(
FusionConvAddBNReluParam<GPU_CL> *param) { FusionConvAddBNReluParam<GPU_CL> *param) {
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1], param->Paddings()[0] == param->Paddings()[1],
"need equal"); "need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext()); param->Filter()->InitCLImage(cl_helper_.CLContext());
param->Bias()->InitCLImage(cl_helper_.CLContext()); param->Bias()->InitCLImage(cl_helper_.CLContext());
......
...@@ -40,18 +40,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) { ...@@ -40,18 +40,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
if (param->Filter()->WidthOfOneBlock() == 1 && if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) { param->Filter()->HeightOfOneBlock() == 1) {
DLOG << " here1 "; DLOG << " here1 ";
this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl"); this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) { } else if (param->Filter()->dims()[1] == 1) {
DLOG << " here2 "; DLOG << " here2 ";
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_kernel.cl"); this->cl_helper_.AddKernel("depth_conv_3x3", "conv_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 && } else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) { param->Filter()->HeightOfOneBlock() == 3) {
DLOG << " here3 "; DLOG << " here3 ";
this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl"); this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
...@@ -64,64 +61,64 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) { ...@@ -64,64 +61,64 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
template <> template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) { void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
// DLOG << " Compute helper: " << &cl_helper_; DLOG << " Compute helper: " << &cl_helper_;
// DLOG << " begin compute "; DLOG << " begin compute ";
// auto kernel = this->cl_helper_.KernelAt(0); auto kernel = this->cl_helper_.KernelAt(0);
// DLOG << " get work size "; DLOG << " get work size ";
// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
// DLOG << " end work size "; DLOG << " end work size ";
// int c_block = default_work_size[0]; int c_block = default_work_size[0];
// int w = default_work_size[1]; int w = default_work_size[1];
// int nh = default_work_size[2]; int nh = default_work_size[2];
// auto input = param.Input()->GetCLImage(); auto input = param.Input()->GetCLImage();
//
// DLOG << " get Input "; DLOG << " get Input ";
//
// auto filter = param.Filter()->GetCLImage(); auto filter = param.Filter()->GetCLImage();
//
// DLOG << " get Filter "; DLOG << " get Filter ";
//
// auto output = param.Output(); auto output = param.Output();
//
// DLOG << " get Output "; DLOG << " get Output ";
//
// int stride = param.Strides()[0]; int stride = param.Strides()[0];
// int offset = param.Offset(); int offset = param.Offset();
// int input_c = param.Input()->CBlock(); int input_c = param.Input()->CBlock();
// int dilation = param.Dilations()[0]; int dilation = param.Dilations()[0];
// int input_width = param.Input()->WidthOfOneBlock(); int input_width = param.Input()->WidthOfOneBlock();
// int input_height = param.Input()->HeightOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock();
//
// cl_int status; cl_int status;
//
// DLOG << " begin set kernel arg "; DLOG << " begin set kernel arg ";
//
// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
// status = clSetKernelArg(kernel, 1, sizeof(int), &w); status = clSetKernelArg(kernel, 1, sizeof(int), &w);
// status = clSetKernelArg(kernel, 2, sizeof(int), &nh); status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
// status = clSetKernelArg(kernel, 6, sizeof(int), &stride); status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
// status = clSetKernelArg(kernel, 7, sizeof(int), &offset); status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
//
// DLOG << " end set kernel arg "; DLOG << " end set kernel arg ";
//
// CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
//
// DLOG << " begin enqueue "; DLOG << " begin enqueue ";
//
// status = status =
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL); default_work_size.data(), NULL, 0, NULL, NULL);
//
// DLOG << " end enqueue "; DLOG << " end enqueue ";
//
// CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
} }
template class ConvKernel<GPU_CL, float>; template class ConvKernel<GPU_CL, float>;
......
...@@ -34,23 +34,24 @@ int main() { ...@@ -34,23 +34,24 @@ int main() {
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims); auto vec_result = paddle_mobile.Predict(input, dims);
// std::vector<float>::iterator biggest = // std::vector<float>::iterator biggest =
// std::max_element(std::begin(vec_result), std::end(vec_result)); // std::max_element(std::begin(vec_result), std::end(vec_result));
// std::cout << " Max element is " << *biggest << " at position " // std::cout << " Max element is " << *biggest << " at position "
// << std::distance(std::begin(vec_result), biggest) << std::endl; // << std::distance(std::begin(vec_result), biggest) <<
// std::endl;
// for (int i = 0; i < 10; ++i) { // for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims); // auto vec_result = paddle_mobile.Predict(input, dims);
// } // }
// auto time3 = paddle_mobile::time(); // auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) { // for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims); // auto vec_result = paddle_mobile.Predict(input, dims);
// } // }
// DLOG << vec_result; // DLOG << vec_result;
// auto time4 = paddle_mobile::time(); // auto time4 = paddle_mobile::time();
// std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 << "ms" // std::cout << "predict cost :" << paddle_mobile::time_diff(time3,
// << std::endl; // time4) / 10 << "ms"
// << std::endl;
} }
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册