diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index 94d5bb86026944c0623941965e5217b743d329c1..e59166df4c39d549ba62787175df1b2bead58907 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -14,110 +14,107 @@ limitations under the License. */ #include "cl_image.h" namespace paddle_mobile { - namespace framework { - void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){ - - DDim ddim = cl_image->dims(); - size_t N,C,H,W; - if(ddim.size()==4){ - N = ddim[0]; - if(N<0){ - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - }else if(ddim.size()==1){ - N = 1; - C = ddim[0]; - H = 1; - W = 1; - } - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - float *p = tensor->data(); - half imageData[width * height * 4]; - cl_int err; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0,0,0}; - size_t region[3] = {width,height,1}; - err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL); - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0; - for (int h = 0; h < H; h++) { - size_t i2 = (i1<<2) + c % 4; - for (int w = 0; w < W; w++) { - *p = half2float(imageData[i2]); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } - - - if (err != CL_SUCCESS) { - // TODO: error handling - } - +namespace framework { +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, + cl_command_queue commandQueue) { + DDim ddim = cl_image->dims(); + size_t N, C, H, W; + if (ddim.size() == 4) { + N = ddim[0]; + if (N < 0) { + N = 1; + } + C = ddim[1]; + H = ddim[2]; + W = ddim[3]; + } else if (ddim.size() == 1) { + N = 1; + C = ddim[0]; + H = 1; + W = 1; + } + + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + float *p = tensor->data(); + half imageData[width * height * 4]; + cl_int err; + cl_mem image = cl_image->GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, + imageData, 0, NULL, NULL); + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = half2float(imageData[i2]); + i2 += 4; + p++; } - void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,cl_command_queue commandQueue){ - - DDim ddim = cl_image->dims(); - size_t N,C,H,W; - if(ddim.size()==4){ - N = ddim[0]; - if(N<0){ - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - }else if(ddim.size()==1){ - N = 1; - C = ddim[0]; - H = 1; - W = 1; - } - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - const float *p = tensor->data(); - half imageData[width * height * 4]; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0,0,0}; - size_t region[3] = {width,height,1}; - cl_int err; - err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL); - if (err != CL_SUCCESS) { - // TODO: error handling - } - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0; - for (int h = 0; h < H; h++) { - size_t i2 = (i1<<2) + c % 4; - for (int w = 0; w < W; w++) { - imageData[i2] = float2half(*p); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } - + i1 += width; + } + } + i0 += width * H; + } + if (err != CL_SUCCESS) { + // TODO: error handling + } +} +void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, + cl_command_queue commandQueue) { + DDim ddim = cl_image->dims(); + size_t N, C, H, W; + if (ddim.size() == 4) { + N = ddim[0]; + if (N < 0) { + N = 1; + } + C = ddim[1]; + H = ddim[2]; + W = ddim[3]; + } else if (ddim.size() == 1) { + N = 1; + C = ddim[0]; + H = 1; + W = 1; + } + + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + const float *p = tensor->data(); + half imageData[width * height * 4]; + cl_mem image = cl_image->GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + cl_int err; + err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, + imageData, 0, NULL, NULL); + if (err != CL_SUCCESS) { + // TODO: error handling + } + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + imageData[i2] = float2half(*p); + i2 += 4; + p++; } + i1 += width; + } } + i0 += width * H; + } } - +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index ad86713f9056daaa85ba1b5c21ea7efeade83924..d2419b206aef22730d56343b0f902fa5adab30de 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -30,6 +30,20 @@ class CLImage { void Init(cl_context context, float *tensorInput, DDim ddim) { tensor_dims_ = ddim; + if (tensorInput) { + tensor_input_ = tensorInput; + } else { + int numel = 1; + for (int i = 0; i < ddim.size(); i++) { + numel *= ddim[i]; + } + tensor_input_ = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + for (int i = 0; i < numel; i++) { + tensor_input_[i] = 0; + } + } + cl_image_format cf = {.image_channel_order = CL_RGBA, .image_channel_data_type = CL_HALF_FLOAT}; // NCHW -> [W * (C+3)/4, H * N] @@ -65,9 +79,9 @@ class CLImage { std::unique_ptr imageData{}; int count = 0; - if (tensorInput != nullptr) { - imageData.reset(new half_t[width * height * 4]); - float *p = tensorInput; + imageData.reset(new half_t[width * height * 4]); + if (tensor_input_ != nullptr) { + float *p = tensor_input_; size_t i0 = 0; for (int n = 0; n < N; n++) { for (int c = 0; c < C; c++) { @@ -75,11 +89,13 @@ class CLImage { for (int h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (int w = 0; w < W; w++) { - if (i2 >= width * height * 4) { - printf("%d > %d ----> %d, %d, %d, %d --- %d, %d, %d\n", i2, - width * height * 4, n, c, h, w, i0, i1, i2); - } - assert(i2 < width * height * 4); + // if (i2 >= width * height * 4) { + // printf("%d > %d ----> %d, %d, %d, %d --- %d, %d, + // %d\n", i2, + // width * height * 4, n, c, h, w, i0, i1, + // i2); + // } + // assert(i2 < width * height * 4); imageData[i2] = float2half(*p); i2 += 4; @@ -153,9 +169,11 @@ class CLImage { cl_context context_; }; -void TensorToCLImage(Tensor *tensor, CLImage *image); +void TensorToCLImage(Tensor *tensor, CLImage *image, + cl_command_queue commandQueue); -void CLImageToTensor(CLImage *image, Tensor *tensor); +void CLImageToTensor(CLImage *image, Tensor *tensor, + cl_command_queue commandQueue); } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h index 789ea548b576fa6e5ea57225d036f9fee08a8dd4..24757db138a2dc62d82ffe837b939f50bcbfe0f1 100644 --- a/src/framework/cl/cl_scope.h +++ b/src/framework/cl/cl_scope.h @@ -56,7 +56,8 @@ class CLScope { auto program = CLEngine::Instance()->CreateProgramWith( context_.get(), "./cl_kernel/" + file_name); - status_ = clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0); + status_ = + clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0); CL_CHECK_ERRORS(status_); programs_[file_name] = std::move(program); diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index c8ef6763a292ab679b9af83ff5a71e0fe66368ca..ef76623bfd72b91935a5c3ffc39e8ebf37906b1c 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -931,7 +931,7 @@ void Executor::InitMemory() { cl_image->Init(context, tensorInput, ddim); delete origin_data; - paddle_mobile::memory::Free(tensorInput); + // paddle_mobile::memory::Free(tensorInput); } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto cl_image = var->template GetMutable(); diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index fa04ac2e464a2dfa9c79b8e338ca66f5b0d6978f..68ce52114cedd4267ef7fa40f9043bc4363bda9f 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -72,13 +72,16 @@ void OperatorBase::Run() { if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; } else { CLImage *cl_image = vari->template GetMutable(); - // cl_command_queue commandQueue = - // scope_->GetCLScpoe()->CommandQueue(); Tensor *tmp ; - // CLImageToTensor(cl_image,tmp,commandQueue); - // tmp->Resize(cl_image->dims()); + // cl_command_queue commandQueue = + // scope_->GetCLScpoe()->CommandQueue(); Tensor + // *tmp ; + // CLImageToTensor(cl_image,tmp,commandQueue); + // tmp->Resize(cl_image->dims()); + const float *input = cl_image->data(); if (cl_image) { - // DLOG<template GetMutable(); + if (tensor) + DLOG << type_ << " output- " << key << "=" << tensor->dims(); + } else { + CLImage *cl_image = vari->template GetMutable(); + // cl_command_queue commandQueue = + // scope_->GetCLScpoe()->CommandQueue(); Tensor *tmp ; + // CLImageToTensor(cl_image,tmp,commandQueue); + // tmp->Resize(cl_image->dims()); + if (cl_image) { + const float *output = cl_image->data(); + DLOG << type_ << " output- " << key << "=" << cl_image->dims(); + // if(output) + // DLOG<Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], - "need equal"); + "need equal"); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); param->SetOffset(offset); @@ -36,7 +36,8 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { } template <> -void DepthwiseConvKernel::Compute(const ConvParam ¶m) { +void DepthwiseConvKernel::Compute( + const ConvParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); int c_block = default_work_size[0]; @@ -78,4 +79,4 @@ template class DepthwiseConvKernel; } // namespace operators } // namespace paddle_mobile -#endif \ No newline at end of file +#endif diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 13873c1612380e5f98c93ca81b30be74bd962e33..4cbdd6e5d1e872b2d98b0653f3172800deb8c9cb 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -12,42 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "common/log.h" #include "operators/kernel/feed_kernel.h" +#include "common/log.h" namespace paddle_mobile { - namespace operators { - - template <> - bool FeedKernel::Init(FeedParam *param) { - DLOG<<"Init feed"; - this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); - return true; - } - - template <> - void FeedKernel::Compute(const FeedParam ¶m) { - - DLOG<<"feed_kernel"; - auto kernel = this->cl_helper_.KernelAt(0); - cl_int status; - auto output = param.Out(); - auto input = param.InputX(); - const float *input_data = input->data(); - cl_mem cl_image = output->GetCLImage(); - int height = output->dims()[2]; - int width = output->dims()[3]; - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data); - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image); - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width); - status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height); - - size_t global_work_size[2] = {height,width}; - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); - } - - template class FeedKernel; - - } // namespace operators +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + DLOG << "Init feed"; + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + cl_int status; + auto output = param.Out(); + const Tensor *input = param.InputX(); + const float *input_data = nullptr; + input_data = input->data(); + + cl_mem cl_image = output->GetCLImage(); + int height = output->dims()[2]; + int width = output->dims()[3]; + DLOG << output->dims(); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height); + + size_t global_work_size[2] = {height, width}; + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + global_work_size, NULL, 0, NULL, NULL); +} + +template class FeedKernel; + +} // namespace operators } // namespace paddle_mobile - diff --git a/src/operators/kernel/feed_kernel.h b/src/operators/kernel/feed_kernel.h index ed287221abd532a128f791828ea4c31a212aff34..2b1220fee534040e5ccae5aee84adf3b4b6290b9 100644 --- a/src/operators/kernel/feed_kernel.h +++ b/src/operators/kernel/feed_kernel.h @@ -18,15 +18,15 @@ limitations under the License. */ #include "operators/op_param.h" namespace paddle_mobile { - namespace operators { - using namespace framework; - template - class FeedKernel - : public framework::OpKernelBase>{ - public: - void Compute(const FeedParam ¶m); - bool Init(FeedParam *param); - }; +namespace operators { +using namespace framework; +template +class FeedKernel + : public framework::OpKernelBase> { + public: + void Compute(const FeedParam ¶m); + bool Init(FeedParam *param); +}; - } // namespace operators +} // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 1a1f910d11885fd31f3ff3454ea5782b0ced4eb6..689eec0da950fbc8a1e7892c0740bff1790fd1ab 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -936,14 +936,14 @@ class FetchParam : public OpParam { FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); } const RType *InputX() const { return input_x_; } - RType *Out() const { return out_; } + Tensor *Out() const { return out_; } private: RType *input_x_; - RType *out_; + Tensor *out_; }; #ifdef TRANSPOSE_OP