From 285066f53735ecaca93f1ed2013082c09c91341f Mon Sep 17 00:00:00 2001 From: liuruilong Date: Sun, 14 Oct 2018 18:59:18 +0800 Subject: [PATCH] commit cl code --- src/common/common.h | 4 + src/framework/cl/cl_engine.h | 2 +- src/framework/cl/cl_image.cpp | 201 +++++++++--------- src/framework/cl/cl_image.h | 128 +++++++---- src/framework/cl/cl_scope.h | 3 +- src/framework/cl/cl_tool.h | 13 +- src/framework/executor.cpp | 13 +- src/operators/feed_op.h | 4 +- src/operators/kernel/cl/cl_kernel/cl_common.h | 6 +- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 55 ++--- src/operators/kernel/cl/conv_add_kernel.cpp | 43 ++-- src/operators/kernel/cl/conv_kernel.cpp | 108 +++++----- .../kernel/cl/depthwise_conv_kernel.cpp | 50 +++-- src/operators/kernel/cl/feed_kernel.cpp | 71 ++++--- src/operators/kernel/cl/relu_kernel.cpp | 8 +- src/operators/kernel/cl/reshape_kernel.cpp | 25 ++- src/operators/kernel/cl/softmax_kernel.cpp | 12 +- src/operators/kernel/feed_kernel.h | 20 +- test/net/test_googlenet.cpp | 4 +- test/net/test_mobilenet_GPU.cpp | 12 +- 20 files changed, 436 insertions(+), 346 deletions(-) diff --git a/src/common/common.h b/src/common/common.h index 12157b5e94..c59e6b7932 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include +namespace paddle_mobile { + using Time = decltype(std::chrono::high_resolution_clock::now()); inline Time time() { return std::chrono::high_resolution_clock::now(); } @@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) { ms counter = std::chrono::duration_cast(diff); return counter.count() / 1000.0; } + +} diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index dc5e8aa60e..ee671a1ff2 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include "CL/cl.h" -#include "common/log.h" #include "common/enforce.h" +#include "common/log.h" #include "framework/cl/cl_deleter.h" #include "framework/cl/cl_tool.h" diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index 94d5bb8602..e59166df4c 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -14,110 +14,107 @@ limitations under the License. */ #include "cl_image.h" namespace paddle_mobile { - namespace framework { - void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){ - - DDim ddim = cl_image->dims(); - size_t N,C,H,W; - if(ddim.size()==4){ - N = ddim[0]; - if(N<0){ - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - }else if(ddim.size()==1){ - N = 1; - C = ddim[0]; - H = 1; - W = 1; - } - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - float *p = tensor->data(); - half imageData[width * height * 4]; - cl_int err; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0,0,0}; - size_t region[3] = {width,height,1}; - err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL); - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0; - for (int h = 0; h < H; h++) { - size_t i2 = (i1<<2) + c % 4; - for (int w = 0; w < W; w++) { - *p = half2float(imageData[i2]); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } - - - if (err != CL_SUCCESS) { - // TODO: error handling - } - +namespace framework { +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, + cl_command_queue commandQueue) { + DDim ddim = cl_image->dims(); + size_t N, C, H, W; + if (ddim.size() == 4) { + N = ddim[0]; + if (N < 0) { + N = 1; + } + C = ddim[1]; + H = ddim[2]; + W = ddim[3]; + } else if (ddim.size() == 1) { + N = 1; + C = ddim[0]; + H = 1; + W = 1; + } + + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + float *p = tensor->data(); + half imageData[width * height * 4]; + cl_int err; + cl_mem image = cl_image->GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, + imageData, 0, NULL, NULL); + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = half2float(imageData[i2]); + i2 += 4; + p++; } - void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,cl_command_queue commandQueue){ - - DDim ddim = cl_image->dims(); - size_t N,C,H,W; - if(ddim.size()==4){ - N = ddim[0]; - if(N<0){ - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - }else if(ddim.size()==1){ - N = 1; - C = ddim[0]; - H = 1; - W = 1; - } - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - const float *p = tensor->data(); - half imageData[width * height * 4]; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0,0,0}; - size_t region[3] = {width,height,1}; - cl_int err; - err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL); - if (err != CL_SUCCESS) { - // TODO: error handling - } - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0; - for (int h = 0; h < H; h++) { - size_t i2 = (i1<<2) + c % 4; - for (int w = 0; w < W; w++) { - imageData[i2] = float2half(*p); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } - + i1 += width; + } + } + i0 += width * H; + } + if (err != CL_SUCCESS) { + // TODO: error handling + } +} +void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, + cl_command_queue commandQueue) { + DDim ddim = cl_image->dims(); + size_t N, C, H, W; + if (ddim.size() == 4) { + N = ddim[0]; + if (N < 0) { + N = 1; + } + C = ddim[1]; + H = ddim[2]; + W = ddim[3]; + } else if (ddim.size() == 1) { + N = 1; + C = ddim[0]; + H = 1; + W = 1; + } + + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + const float *p = tensor->data(); + half imageData[width * height * 4]; + cl_mem image = cl_image->GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + cl_int err; + err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, + imageData, 0, NULL, NULL); + if (err != CL_SUCCESS) { + // TODO: error handling + } + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + imageData[i2] = float2half(*p); + i2 += 4; + p++; } + i1 += width; + } } + i0 += width * H; + } } - +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index ad86713f90..5943ecc9c7 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -28,8 +28,93 @@ class CLImage { public: CLImage() = default; - void Init(cl_context context, float *tensorInput, DDim ddim) { - tensor_dims_ = ddim; + /* + * will not hold input tensor data, memcpy in this method + * */ + void SetTensorData(float *tensorData, const DDim &dim) { + int numel = product(dim); + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + } + tensor_data_ = new float[numel]; + memcpy(tensor_data_, tensorData, numel); + tensor_dims_ = dim; + } + + /* + * need call SetTensorData first + * */ + void InitCLImage(cl_context context) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + InitCLImage(context, tensor_data_, tensor_dims_); + delete[](tensor_data_); + tensor_data_ = nullptr; + initialized_ = true; + } + + void InitEmptyImage(cl_context context, const DDim &dim) { + if (tensor_data_ != nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION( + " empty image tensor data shouldn't have value"); + } + InitCLImage(context, nullptr, dim); + initialized_ = true; + } + + cl_mem GetCLImage() const { return cl_image_; } + + const DDim &ImageDims() { return image_dims_; } + + inline size_t ImageWidth() const { return image_width_; } + + inline size_t ImageHeight() const { return image_height_; } + + /* + * block of channels, 4 channel one block + * */ + inline size_t CBlock() const { return c_block_; } + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + /* + * resize original tensor dim + * */ + inline CLImage &Resize(const DDim &dims) { + tensor_dims_ = dims; + return *this; + } + + template + T *data() const { + if (initialized_) { + PADDLE_MOBILE_THROW_EXCEPTION( + " cl image has initialized, tensor data has been deleted "); + } + return reinterpret_cast(tensor_data_); + } + + /* + * numel of tensor dim + * */ + inline int64_t numel() const { return product(tensor_dims_); } + + /* + * original tensor dim + * */ + const DDim &dims() const { return tensor_dims_; } + + private: + void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) { cl_image_format cf = {.image_channel_order = CL_RGBA, .image_channel_data_type = CL_HALF_FLOAT}; // NCHW -> [W * (C+3)/4, H * N] @@ -62,12 +147,13 @@ class CLImage { image_width_ = width; image_height_ = height; + image_dims_ = make_ddim({image_width_, image_height_}); std::unique_ptr imageData{}; int count = 0; - if (tensorInput != nullptr) { + if (tensor_data != nullptr) { imageData.reset(new half_t[width * height * 4]); - float *p = tensorInput; + float *p = tensor_data; size_t i0 = 0; for (int n = 0; n < N; n++) { for (int c = 0; c < C; c++) { @@ -108,39 +194,8 @@ class CLImage { // TODO(HaiPeng): error handling PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); } - - initialized_ = true; - } - - void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); } - - inline CLImage &Resize(const DDim &dims) { - tensor_dims_ = dims; - return *this; - } - - const DDim &dims() const { return tensor_dims_; } - - cl_mem GetCLImage() const { return cl_image_; } - - template - T *data() const { - return reinterpret_cast(tensor_input_); } - inline int64_t numel() const { return product(tensor_dims_); } - - inline size_t ImageWidth() const { return image_width_; } - - inline size_t ImageHeight() const { return image_height_; } - - inline size_t CBlock() const { return c_block_; } - - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - private: bool initialized_ = false; cl_mem cl_image_; size_t image_width_; @@ -149,7 +204,8 @@ class CLImage { size_t image_height_; size_t c_block_; DDim tensor_dims_; - float *tensor_input_; + DDim image_dims_; + float *tensor_data_; cl_context context_; }; diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h index 789ea548b5..24757db138 100644 --- a/src/framework/cl/cl_scope.h +++ b/src/framework/cl/cl_scope.h @@ -56,7 +56,8 @@ class CLScope { auto program = CLEngine::Instance()->CreateProgramWith( context_.get(), "./cl_kernel/" + file_name); - status_ = clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0); + status_ = + clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0); CL_CHECK_ERRORS(status_); programs_[file_name] = std::move(program); diff --git a/src/framework/cl/cl_tool.h b/src/framework/cl/cl_tool.h index 74a20f4818..25d5bfc584 100644 --- a/src/framework/cl/cl_tool.h +++ b/src/framework/cl/cl_tool.h @@ -21,12 +21,13 @@ namespace framework { const char* opencl_error_to_str(cl_int error); -#define CL_CHECK_ERRORS(ERR) \ - if (ERR != CL_SUCCESS) { \ - printf( \ - "OpenCL error with code %s happened in file %s at line %d. " \ - "Exiting.\n", \ - opencl_error_to_str(ERR), __FILE__, __LINE__); \ +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "OpenCL error with code %s happened in file %s at line %d. " \ + "Exiting.\n", \ + paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ + __LINE__); \ } } // namespace framework diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index c8ef6763a2..3716e8d5df 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -928,7 +928,8 @@ void Executor::InitMemory() { framework::DDim ddim = framework::make_ddim(desc.Dims()); - cl_image->Init(context, tensorInput, ddim); + // has not init + cl_image->SetTensorData(tensorInput, ddim); delete origin_data; paddle_mobile::memory::Free(tensorInput); @@ -941,7 +942,7 @@ void Executor::InitMemory() { // framework::DDim ddim = framework::make_ddim(desc.Dims()); framework::DDim ddim = cl_image->dims(); DLOG << var_desc->Name(); - cl_image->Init(context, ddim); + cl_image->InitEmptyImage(context, ddim); } } } @@ -982,7 +983,10 @@ void Executor::InitCombineMemory() { float *tensorInput = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * numel)); LoadMemory(*var_desc, tensorInput, &origin_data); - cl_image->Init(context, tensorInput, ddim); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + paddle_mobile::memory::Free(tensorInput); } else { auto cl_image = var->template GetMutable(); @@ -991,8 +995,7 @@ void Executor::InitCombineMemory() { const framework::TensorDesc &desc = var_desc->Tensor_desc(); framework::DDim ddim = cl_image->dims(); // framework::DDim ddim = framework::make_ddim(desc.Dims()); - - cl_image->Init(context, ddim); + cl_image->InitEmptyImage(context, ddim); } } } diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 0890a45c9e..2f34b75b93 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -98,8 +98,8 @@ class FeedOp : public framework::OperatorBase { void Init() {} void RunImpl() { - param_.Out()->ShareDataWith(*param_.InputX()); - param_.Out()->set_lod(param_.InputX()->lod()); + param_.Out()->ShareDataWith(*param_.InputX()); + param_.Out()->set_lod(param_.InputX()->lod()); } protected: diff --git a/src/operators/kernel/cl/cl_kernel/cl_common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h index c71967ccd6..95f0ab8208 100644 --- a/src/operators/kernel/cl/cl_kernel/cl_common.h +++ b/src/operators/kernel/cl/cl_kernel/cl_common.h @@ -18,9 +18,10 @@ limitations under the License. */ inline hafl4 activation(half4 in #ifdef PRELU - ,half4 prelu_alpha + , + half4 prelu_alpha #endif - ) { +) { half4 output; #ifdef PRELU output = select(prelu_alpha * in, in, in >= (half4)0.0); @@ -31,4 +32,3 @@ inline hafl4 activation(half4 in #endif return output; } - diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index fd846be802..29b13b6abc 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -16,6 +16,7 @@ limitations under the License. */ #include "operators/kernel/conv_add_bn_relu_kernel.h" #include "framework/cl/cl_image.h" +#include "framework/cl/cl_tool.h" namespace paddle_mobile { namespace operators { @@ -56,15 +57,15 @@ bool ConvAddBNReluKernel::Init( framework::CLImage *new_scale = new framework::CLImage(); - new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr, - variance->dims()); + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext()); framework::CLImage *new_bias = new framework::CLImage(); - new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims()); + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext()); param->SetNewScale(new_scale); - param->SetNewBias(new_bias); PADDLE_MOBILE_ENFORCE( @@ -115,26 +116,32 @@ void ConvAddBNReluKernel::Compute( int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); - clSetKernelArg(kernel, 0, sizeof(int), &c_block); - clSetKernelArg(kernel, 1, sizeof(int), &w); - clSetKernelArg(kernel, 2, sizeof(int), &nh); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); - clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); - clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - clSetKernelArg(kernel, 9, sizeof(int), &stride); - clSetKernelArg(kernel, 10, sizeof(int), &offset); - clSetKernelArg(kernel, 11, sizeof(int), &input_c); - clSetKernelArg(kernel, 12, sizeof(int), &dilation); - clSetKernelArg(kernel, 13, sizeof(int), &input_width); - clSetKernelArg(kernel, 14, sizeof(int), &input_height); - clSetKernelArg(kernel, 15, sizeof(int), &output_width); - clSetKernelArg(kernel, 16, sizeof(int), &output_height); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + + CL_CHECK_ERRORS(status); + + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 696ae01bcc..b5fd82c47a 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -65,24 +65,31 @@ void ConvAddKernel::Compute( int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); - clSetKernelArg(kernel, 0, sizeof(int), &c_block); - clSetKernelArg(kernel, 1, sizeof(int), &w); - clSetKernelArg(kernel, 2, sizeof(int), &nh); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); - clSetKernelArg(kernel, 7, sizeof(int), &stride); - clSetKernelArg(kernel, 8, sizeof(int), &offset); - clSetKernelArg(kernel, 9, sizeof(int), &input_c); - clSetKernelArg(kernel, 10, sizeof(int), &dilation); - clSetKernelArg(kernel, 11, sizeof(int), &input_width); - clSetKernelArg(kernel, 12, sizeof(int), &input_height); - clSetKernelArg(kernel, 13, sizeof(int), &output_width); - clSetKernelArg(kernel, 14, sizeof(int), &output_height); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + + CL_CHECK_ERRORS(status); + + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); } template class ConvAddKernel; diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index ee7b56629a..d31553b60e 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -21,63 +21,69 @@ namespace operators { template <> bool ConvKernel::Init(ConvParam *param) { - // PADDLE_MOBILE_ENFORCE( - // param->Filter()->dims()[2] == param->Filter()->dims()[3] && - // param->Paddings()[0] == param->Paddings()[1], - // "need equal"); - // int offset = static_cast(param->Filter()->dims()[2]) / 2 - - // static_cast(param->Paddings()[1]); - // param->SetOffset(offset); - // - // if (param->Filter()->WidthOfOneBlock() == 1 && - // param->Filter()->HeightOfOneBlock() == 1) { - // this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); - // } else if (param->Filter()->dims()[1] == 1) { - // this->cl_helper_.AddKernel("depth_conv_3x3", - // "conv_add_bn_relu_kernel.cl"); - // } else if (param->Filter()->WidthOfOneBlock() == 3 && - // param->Filter()->HeightOfOneBlock() == 3) { - // this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); - // } else { - // PADDLE_MOBILE_THROW_EXCEPTION(" not support "); - // } + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->WidthOfOneBlock() == 1 && + param->Filter()->HeightOfOneBlock() == 1) { + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1) { + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->WidthOfOneBlock() == 3 && + param->Filter()->HeightOfOneBlock() == 3) { + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } return true; } template <> void ConvKernel::Compute(const ConvParam ¶m) { - // auto kernel = this->cl_helper_.KernelAt(0); - // auto default_work_size = - // this->cl_helper_.DefaultWorkSize(*param.Output()); int c_block = - // default_work_size[0]; int w = default_work_size[1]; int nh = - // default_work_size[2]; auto input = param.Input()->GetCLImage(); auto - // filter = param.Filter()->GetCLImage(); auto output = param.Output(); int - // stride = param.Strides()[0]; int offset = param.Offset(); int input_c = - // param.Input()->CBlock(); int dilation = param.Dilations()[0]; int - // input_width = param.Input()->WidthOfOneBlock(); int input_height = - // param.Input()->HeightOfOneBlock(); - // - // clSetKernelArg(kernel, 0, sizeof(int), &c_block); - // clSetKernelArg(kernel, 1, sizeof(int), &w); - // clSetKernelArg(kernel, 2, sizeof(int), &nh); - // clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - // clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - // clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); - // clSetKernelArg(kernel, 6, sizeof(int), &stride); - // clSetKernelArg(kernel, 7, sizeof(int), &offset); - // clSetKernelArg(kernel, 8, sizeof(int), &input_c); - // clSetKernelArg(kernel, 9, sizeof(int), &dilation); - // clSetKernelArg(kernel, 10, sizeof(int), &input_width); - // clSetKernelArg(kernel, 11, sizeof(int), &input_height); - // - // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - // default_work_size.data(), NULL, 0, NULL, NULL); - - // auto kernel = this->cl_helper_.KernelAt(0); - // size_t global_work_size[3] = {1, 2, 3}; - // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - // global_work_size, NULL, 0, NULL, NULL); + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->WidthOfOneBlock(); + int input_height = param.Input()->HeightOfOneBlock(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + + CL_CHECK_ERRORS(status); + + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); } template class ConvKernel; diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp index 73ab8d7e1e..99b5a714d6 100644 --- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -24,9 +24,9 @@ template <> bool DepthwiseConvKernel::Init(ConvParam *param) { DLOG << " depthwise conv kernel init begin "; PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], - "need equal"); + "need equal"); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); param->SetOffset(offset); @@ -36,7 +36,8 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { } template <> -void DepthwiseConvKernel::Compute(const ConvParam ¶m) { +void DepthwiseConvKernel::Compute( + const ConvParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); int c_block = default_work_size[0]; @@ -54,23 +55,30 @@ void DepthwiseConvKernel::Compute(const ConvParam ¶m) int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); - clSetKernelArg(kernel, 0, sizeof(int), &c_block); - clSetKernelArg(kernel, 1, sizeof(int), &w); - clSetKernelArg(kernel, 2, sizeof(int), &nh); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); - clSetKernelArg(kernel, 6, sizeof(int), &stride); - clSetKernelArg(kernel, 7, sizeof(int), &offset); - clSetKernelArg(kernel, 8, sizeof(int), &input_c); - clSetKernelArg(kernel, 9, sizeof(int), &dilation); - clSetKernelArg(kernel, 10, sizeof(int), &input_width); - clSetKernelArg(kernel, 11, sizeof(int), &input_height); - clSetKernelArg(kernel, 12, sizeof(int), &output_width); - clSetKernelArg(kernel, 13, sizeof(int), &output_height); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + CL_CHECK_ERRORS(status); + + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); } template class DepthwiseConvKernel; @@ -78,4 +86,4 @@ template class DepthwiseConvKernel; } // namespace operators } // namespace paddle_mobile -#endif \ No newline at end of file +#endif diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 13873c1612..ef9d28a918 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -12,42 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "common/log.h" #include "operators/kernel/feed_kernel.h" +#include "common/log.h" namespace paddle_mobile { - namespace operators { - - template <> - bool FeedKernel::Init(FeedParam *param) { - DLOG<<"Init feed"; - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; - } - - template <> - void FeedKernel::Compute(const FeedParam ¶m) { - - DLOG<<"feed_kernel"; - auto kernel = this->cl_helper_.KernelAt(0); - cl_int status; - auto output = param.Out(); - auto input = param.InputX(); - const float *input_data = input->data(); - cl_mem cl_image = output->GetCLImage(); - int height = output->dims()[2]; - int width = output->dims()[3]; - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data); - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image); - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width); - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height); - - size_t global_work_size[2] = {height,width}; - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - } - - template class FeedKernel; - - } // namespace operators +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + DLOG << "Init feed"; + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + DLOG << "feed_kernel"; + auto kernel = this->cl_helper_.KernelAt(0); + cl_int status; + auto output = param.Out(); + auto input = param.InputX(); + DLOG << " input: " << input; + + const float *input_data = input->data(); + cl_mem cl_image = output->GetCLImage(); + int height = output->dims()[2]; + int width = output->dims()[3]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height); + + size_t global_work_size[2] = {height, width}; + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + global_work_size, NULL, 0, NULL, NULL); +} + +template class FeedKernel; + +} // namespace operators } // namespace paddle_mobile - diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp index d571ec2a92..71304b9c30 100644 --- a/src/operators/kernel/cl/relu_kernel.cpp +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -19,13 +19,13 @@ namespace paddle_mobile { namespace operators { template <> -bool ReluKernel::Init(ReluParam *param) { +bool ReluKernel::Init(ReluParam* param) { this->cl_helper_.AddKernel("relu", "relu.cl"); return true; } template <> -void ReluKernel::Compute(const ReluParam ¶m) { +void ReluKernel::Compute(const ReluParam& param) { auto kernel = this->cl_helper_.KernelAt(0); const auto* input = param.InputX(); auto* output = param.Out(); @@ -34,7 +34,7 @@ void ReluKernel::Compute(const ReluParam ¶m) { auto outputImage = output->GetCLImage(); clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - const size_t work_size[2] = { input->ImageWidth(), input->ImageHeight() }; + const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, work_size, NULL, 0, NULL, NULL); } @@ -43,4 +43,4 @@ template class ReluKernel; } // namespace operators } // namespace paddle_mobile -#endif \ No newline at end of file +#endif diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp index 1738c62d23..877a325636 100644 --- a/src/operators/kernel/cl/reshape_kernel.cpp +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -25,30 +25,29 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) { -auto kernel = this->cl_helper_.KernelAt(0); - const auto * input = param.InputX(); - auto * output = param.Out(); + auto kernel = this->cl_helper_.KernelAt(0); + const auto *input = param.InputX(); + auto *output = param.Out(); auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - const auto & inputDim = input->dims(); - const auto & outputDim = output->dims(); + const auto &inputDim = input->dims(); + const auto &outputDim = output->dims(); int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]}; int odims[4] = {outputDim[0], outputDim[1], outputDim[2], outputDim[3]}; clSetKernelArg(kernel, 2, sizeof(int), dims); - clSetKernelArg(kernel, 3, sizeof(int), dims+1); - clSetKernelArg(kernel, 4, sizeof(int), dims+2); - clSetKernelArg(kernel, 5, sizeof(int), dims+3); + clSetKernelArg(kernel, 3, sizeof(int), dims + 1); + clSetKernelArg(kernel, 4, sizeof(int), dims + 2); + clSetKernelArg(kernel, 5, sizeof(int), dims + 3); clSetKernelArg(kernel, 6, sizeof(int), odims); - clSetKernelArg(kernel, 7, sizeof(int), odims+1); - clSetKernelArg(kernel, 8, sizeof(int), odims+2); - clSetKernelArg(kernel, 9, sizeof(int), odims+3); - const size_t work_size[2] = { output->ImageWidth(), output->ImageHeight() }; + clSetKernelArg(kernel, 7, sizeof(int), odims + 1); + clSetKernelArg(kernel, 8, sizeof(int), odims + 2); + clSetKernelArg(kernel, 9, sizeof(int), odims + 3); + const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); - } template class ReshapeKernel; diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index 44fadcc283..1404ea40c7 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -29,18 +29,18 @@ template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); - const auto * input = param.InputX(); - auto * output = param.Out(); + const auto *input = param.InputX(); + auto *output = param.Out(); auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - const auto & inputDim = input->dims(); + const auto &inputDim = input->dims(); int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]}; clSetKernelArg(kernel, 2, sizeof(int), dims); - clSetKernelArg(kernel, 3, sizeof(int), dims+1); - clSetKernelArg(kernel, 4, sizeof(int), dims+2); - clSetKernelArg(kernel, 5, sizeof(int), dims+3); + clSetKernelArg(kernel, 3, sizeof(int), dims + 1); + clSetKernelArg(kernel, 4, sizeof(int), dims + 2); + clSetKernelArg(kernel, 5, sizeof(int), dims + 3); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/feed_kernel.h b/src/operators/kernel/feed_kernel.h index ed287221ab..2b1220fee5 100644 --- a/src/operators/kernel/feed_kernel.h +++ b/src/operators/kernel/feed_kernel.h @@ -18,15 +18,15 @@ limitations under the License. */ #include "operators/op_param.h" namespace paddle_mobile { - namespace operators { - using namespace framework; - template - class FeedKernel - : public framework::OpKernelBase>{ - public: - void Compute(const FeedParam ¶m); - bool Init(FeedParam *param); - }; +namespace operators { +using namespace framework; +template +class FeedKernel + : public framework::OpKernelBase> { + public: + void Compute(const FeedParam ¶m); + bool Init(FeedParam *param); +}; - } // namespace operators +} // namespace operators } // namespace paddle_mobile diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index a2f030eeac..9e826d3a74 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -29,8 +29,8 @@ int main() { bool optimize = true; auto time1 = time(); if (paddle_mobile.Load(g_googlenet, optimize)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl; std::vector input; std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp index f0994855fa..f65e1890f3 100644 --- a/test/net/test_mobilenet_GPU.cpp +++ b/test/net/test_mobilenet_GPU.cpp @@ -19,14 +19,14 @@ limitations under the License. */ int main() { paddle_mobile::PaddleMobile paddle_mobile; // paddle_mobile.SetThreadNum(4); - auto time1 = time(); + auto time1 = paddle_mobile::time(); // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // std::string(g_mobilenet_detect) + "/params", true); auto isok = paddle_mobile.Load(g_mobilenet, false); if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" << std::endl; std::vector input; std::vector dims{1, 3, 224, 224}; @@ -42,13 +42,13 @@ int main() { for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } - auto time3 = time(); + auto time3 = paddle_mobile::time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } DLOG << vec_result; - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + auto time4 = paddle_mobile::time(); + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 << "ms" << std::endl; } -- GitLab