diff --git a/CMakeLists.txt b/CMakeLists.txt index 020a9179d8a8c0127a4baf0ff51273a0337515a4..b40aee8088bb6889858aa76776e6b060030de7c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.6) -option(USE_OPENMP "openmp support" OFF) - -project(paddle-mobile) +option(USE_OPENMP "openmp support" OFF) option(DEBUGING "enable debug mode" ON) option(USE_EXCEPTION "use std exception" ON) option(LOG_PROFILE "log profile" OFF) @@ -12,6 +10,8 @@ option(GPU_MALI "mali gpu" OFF) option(GPU_CL "opencl gpu" ON) option(FPGA "fpga" OFF) +project(paddle-mobile) + file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) include_directories(src/) diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index 2bfc6ae6b3051753a1e831c898110db324288187..f9f373b2a74087960b03c55ec922f95f187cfbc4 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -90,7 +90,8 @@ class CLEngine { bool BuildProgram(cl_program program) { cl_int status; - status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel", 0, 0); + status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel", + 0, 0); CL_CHECK_ERRORS(status); @@ -98,7 +99,7 @@ class CLEngine { size_t log_size; clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char *log = (char *)malloc(log_size); + char *log = reinterpret_cast<char *>(malloc(log_size)); clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), CL_PROGRAM_BUILD_LOG, log_size, log, NULL); DLOG << " program build error: " << log; diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h index 6f3f83e272281b0446ca8f39ef54d52085f6c895..bea91ee24ceb5e9011708bd277629a07beb4b8ef 100644 --- a/src/framework/cl/cl_helper.h +++ b/src/framework/cl/cl_helper.h @@ -49,9 +49,6 @@ class CLHelper { cl_context CLContext() { return scope_->Context(); } std::vector<size_t> DefaultWorkSize(const CLImage &image) { - if (image.GetImageType() == Invalid) { - PADDLE_MOBILE_THROW_EXCEPTION(" not support image type"); - } // n c h w auto image_dim = image.dims(); if (image_dim.size() == 4) { @@ -66,7 +63,7 @@ } else if (image_dim.size() == 2) { return {1, image.ImageWidth(), image.ImageHeight()}; } else if (image_dim.size() == 1) { - return {1, image.ImageWidth(), 1}; + return {1, image.ImageWidth(), 1}; } PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); }
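For reference, the error path in BuildProgram above follows the standard two-step OpenCL build-log query: ask clGetProgramBuildInfo for the log size first, then fetch the log itself. A minimal standalone sketch of the same pattern follows; the function and variable names are illustrative and not part of this patch, and a std::vector stands in for the raw, never-freed malloc:

#include <CL/cl.h>
#include <cstdio>
#include <vector>

// Illustrative sketch of CLEngine::BuildProgram's error path.
bool BuildProgramWithLog(cl_program program, cl_device_id device) {
  cl_int status = clBuildProgram(program, 0, nullptr,
                                 "-cl-fast-relaxed-math -I cl_kernel",
                                 nullptr, nullptr);
  if (status == CL_SUCCESS) return true;
  // First call: query only the size of the build log.
  size_t log_size = 0;
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr,
                        &log_size);
  // Second call: fetch the log into an owned, NUL-terminated buffer.
  std::vector<char> log(log_size + 1, '\0');
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size,
                        log.data(), nullptr);
  std::fprintf(stderr, "program build error: %s\n", log.data());
  return false;
}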
diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index 2cb4f4ecea06cc9d863dd7fe7432cce2b651e75e..f32de0a61461d9a9b28d4a0cf5e13ecc9d564cf5 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -16,214 +16,47 @@ limitations under the License. */ namespace paddle_mobile { namespace framework { + void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_command_queue commandQueue) { - DDim ddim = cl_image->dims(); - size_t N, C, H, W; - if (ddim.size() == 4) { - N = ddim[0]; - if (N < 0) { - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - } else if (ddim.size() == 1) { - N = 1; - C = ddim[0]; - H = 1; - W = 1; - } - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - float *p = tensor->mutable_data<float>(); - half imageData[width * height * 4]; - cl_int err; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, - imageData, 0, NULL, NULL); - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - *p = Half2Float(imageData[i2]); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } - - if (err != CL_SUCCESS) { - CL_CHECK_ERRORS(err); - } + // TODO(yangfei): need imp } + void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, cl_command_queue commandQueue) { - DDim ddim = cl_image->dims(); - size_t N, C, H, W; - if (ddim.size() == 4) { - N = ddim[0]; - if (N < 0) { - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - } else if (ddim.size() == 1) { - N = 1; - C = ddim[0]; - H = 1; - W = 1; - } - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - const float *p = tensor->data<float>(); - half imageData[width * height * 4]; - cl_mem image = cl_image->GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - cl_int err; - err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, - imageData, 0, NULL, NULL); - if (err != CL_SUCCESS) { - CL_CHECK_ERRORS(err); - } - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - imageData[i2] = Float2Half(*p); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } + // TODO(yangfei): need imp } + #ifdef PADDLE_MOBILE_DEBUG Print &operator<<(Print &printer, const CLImage &cl_image) { - if (cl_image.GetImageType() == Invalid) { - PADDLE_MOBILE_THROW_EXCEPTION(" not support image type"); - } - printer << " dims: " << cl_image.dims() << "\n"; - int stride = cl_image.numel() / 20; - stride = stride > 0 ? 
stride : 1; - float *data = new float[cl_image.numel()]; - DDim ddim = cl_image.dims(); - size_t N, C, H, W, width, height; - - if (cl_image.GetImageType() == Normal || cl_image.dims().size() == 3 || - cl_image.dims().size() == 4) { - if (ddim.size() == 4) { - N = ddim[0]; - if (N < 0) { - N = 1; - } - C = ddim[1]; - H = ddim[2]; - W = ddim[3]; - width = W * ((C + 3) / 4); - height = N * H; - } else if (ddim.size() == 2) { - width = ddim[1]; - height = ddim[0]; - N = 1; - C = 1; - H = ddim[0]; - W = ddim[1]; - } else if (ddim.size() == 1) { - width = ddim[0]; - height = 1; - N = 1; - C = 1; - H = 1; - W = ddim[0]; - } - float *p = data; - half *imageData = new half[height * width * 4]; - cl_int err; - cl_mem image = cl_image.GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, - region, 0, 0, imageData, 0, NULL, NULL); - - if (err != CL_SUCCESS) { - printf("ImageWidth %ld \n", cl_image.ImageWidth()); - printf("ImageWidth %ld \n", cl_image.ImageHeight()); - } - - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - *p = Half2Float(imageData[i2]); - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } - delete (imageData); - CL_CHECK_ERRORS(err); + int width = cl_image.ImageDims()[0]; + int height = cl_image.ImageDims()[1]; - } else { - if (ddim.size() == 2) { - width = (ddim[1] + 3) / 4; - height = ddim[0]; - H = ddim[0]; - W = ddim[1]; + half_t *image_data = new half_t[height * width * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, + region, 0, 0, image_data, 0, NULL, NULL); - } else if (ddim.size() == 1) { - width = (ddim[0] + 3) / 4; - height = 1; - H = 1; - W = ddim[0]; - } - float *p = data; - half *imageData = new half[width * height * 4]; - cl_int err; - cl_mem image = cl_image.GetCLImage(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {width, height, 1}; - err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, - region, 0, 0, imageData, 0, NULL, NULL); - for (int h = 0; h < H; h++) { - for (int w = 0; w < W; w++) { - p[h * W + w] = Half2Float(imageData[(h * width + w / 4) * 4 + (w % 4)]); - } - } + CL_CHECK_ERRORS(err); - delete (imageData); - CL_CHECK_ERRORS(err); - } + float *tensor_data = new float[cl_image.numel()]; + auto converter = cl_image.Converter(); + converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), + cl_image.dims()); + int stride = cl_image.numel() / 20; + stride = stride > 0 ? stride : 1; + printer << " dims: " << cl_image.dims() << "\n"; for (int i = 0; i < cl_image.numel(); i += stride) { - printer << data[i] << " "; + printer << tensor_data[i] << " "; } - delete (data); + + delete[](tensor_data); + delete[](image_data); + return printer; } #endif diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index b88091e09b03940a97170941ea0c48b2874260f1..90c55aab6655070d907091da69b5dfdbd0a34aa9 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -18,22 +18,30 @@ limitations under the License. 
*/ #include "CL/cl.h" -#include "framework/cl/cl_half.h" -#include "framework/cl/cl_tool.h" #include "framework/cl/cl_deleter.h" #include "framework/cl/cl_engine.h" +#include "framework/cl/cl_half.h" +#include "framework/cl/cl_image_converter.h" +#include "framework/cl/cl_tool.h" #include "framework/ddim.h" #include "framework/tensor.h" namespace paddle_mobile { namespace framework { -enum ImageType { Invalid = -1, Normal = 0, Folder = 1 }; - class CLImage { public: CLImage() = default; + ~CLImage() { + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + } + + if (image_converter_) { + delete (image_converter_); + } + } /* * will not hold input tensor data, memcpy in this method * */ @@ -54,79 +62,79 @@ class CLImage { * folder when one dim or two dim * */ void InitCLImage(cl_context context, cl_command_queue command_queue) { - if (tensor_data_ == nullptr) { - PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); - } - DLOG << tensor_dims_; - if (tensor_dims_.size() <= 2) { - DLOG << " dim <= 2 folder ~~~~~ "; - InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_); - } else { - DLOG << " dim > 2 norm ~~~~~ "; - InitCLImage(context, command_queue, tensor_data_, tensor_dims_); + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + InitCLImage(context, command_queue, folder_converter); + } + + void InitCLImage(cl_context context, cl_command_queue command_queue, + CLImageConverterBase *converter) { + if (image_converter_ != nullptr) { + delete (image_converter_); } + + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + + DLOG << " begin init cl image "; + image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); + + half_t *image_data = new half_t[product(image_dims_) * 4]; + + DLOG << " convert to image"; + converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); + DLOG << " end convert to image"; + + InitCLImage(context, image_dims_[0], image_dims_[1], image_data); + + delete[](image_data); delete[](tensor_data_); + + command_queue_ = command_queue; tensor_data_ = nullptr; + image_converter_ = converter; initialized_ = true; + DLOG << " end init cl image"; } - /* - * need call SetTensorData first - * */ - void InitCLImageNormal(cl_context context, cl_command_queue command_queue) { + void InitNImage(cl_context context, cl_command_queue command_queue) { if (tensor_data_ == nullptr) { PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); } - InitCLImage(context, command_queue, tensor_data_, tensor_dims_); - delete[](tensor_data_); - tensor_data_ = nullptr; - initialized_ = true; + CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock(); + InitCLImage(context, command_queue, folder_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); } void InitEmptyImage(cl_context context, cl_command_queue command_queue, const DDim &dim) { - if (tensor_data_ != nullptr) { - PADDLE_MOBILE_THROW_EXCEPTION( - " empty image tensor data shouldn't have value"); - } - DLOG << " init empty image "; - if (tensor_dims_.size() <= 2) { - DLOG << " dim <= 2 folder ~~~~~ "; - InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_); - } else { - DLOG << " dim > 2 norm ~~~~~ "; - InitCLImage(context, command_queue, tensor_data_, tensor_dims_); - } + PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, + " empty image tensor data shouldn't have value"); - cl_event_ = 
CLEngine::Instance()->CreateEvent(context); + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + + DLOG << " to get image dims "; + image_dims_ = folder_converter->InitImageDimInfoWith(dim); + DLOG << " end get image dims " << image_dims_; + InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); -// InitCLImage(context, command_queue, nullptr, dim); + tensor_dims_ = dim; + command_queue_ = command_queue; + image_converter_ = folder_converter; + cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; + DLOG << " end init cl image"; } cl_mem GetCLImage() const { return cl_image_.get(); } const DDim &ImageDims() const { return image_dims_; } - inline size_t ImageWidth() const { return image_width_; } - - inline size_t ImageHeight() const { return image_height_; } + inline size_t ImageWidth() const { return image_dims_[0]; } - /* - * block of channels, 4 channel one block - * */ - inline size_t CBlock() const { return c_block_; } - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + inline size_t ImageHeight() const { return image_dims_[1]; } inline cl_command_queue CommandQueue() const { return command_queue_; } @@ -158,47 +166,11 @@ * */ const DDim &dims() const { return tensor_dims_; } - const ImageType GetImageType() const { return image_type_; } - cl_event GetClEvent() const { return cl_event_.get(); } - private: - ImageType image_type_ = Invalid; - void InitCLImage2C(cl_context context, cl_command_queue command_queue, - float *tensor_data, const DDim &dim) { - image_type_ = Folder; - command_queue_ = command_queue; - assert(dim.size() <= 2); - int tdim[2] = {1, 1}; - if (dim.size() == 1) { - tdim[1] = dim[0]; - } else { - tdim[0] = dim[0]; - tdim[1] = dim[1]; - } - int width = (tdim[1] + 3) / 4; - int height = tdim[0]; - - image_width_ = width; - image_height_ = height; - image_dims_ = make_ddim({width, height}); - width_of_one_block_ = width; - height_of_one_block_ = height; - c_block_ = 1; - - std::unique_ptr<half_t[]> imageData{}; - if (tensor_data) { - imageData.reset(new half_t[width * height * 4]); - for (int h = 0; h < tdim[0]; h++) { - for (int w = 0; w < tdim[1]; w++) { - imageData[(h * width + w / 4) * 4 + (w % 4)] = - Float2Half(tensor_data[h * tdim[1] + w]); - } - } - } - InitCLImage(context, width, height, imageData.get()); - } + CLImageConverterBase *Converter() const { return image_converter_; } + private: void InitCLImage(cl_context context, int width, int height, void *data) { cl_image_format cf = {.image_channel_order = CL_RGBA, .image_channel_data_type = CL_HALF_FLOAT}; @@ -228,89 +200,16 @@ PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); } } - void InitCLImage(cl_context context, cl_command_queue command_queue, - float *tensor_data, const DDim &dim) { - image_type_ = Normal; - DLOG << " tensor dim: " << dim; - // NCHW -> [W * (C+3)/4, H * N] - tensor_dims_ = dim; - command_queue_ = command_queue; - if (tensor_data) { - tensor_data_ = tensor_data; - } - size_t new_dims[] = {1, 1, 1, 1}; - - for (int j = 0; j < dim.size(); ++j) { - new_dims[4 - dim.size() + j] = dim[j]; - } - - size_t N, C, H, W; - - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - width_of_one_block_ = W; - height_of_one_block_ = H; - - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - image_width_ = 
width; - image_height_ = height; - image_dims_ = make_ddim({image_width_, image_height_}); - c_block_ = width / W; - - DLOG << " tensor dim " << tensor_dims_; - DLOG << " at assignment: image width: " << image_width_; - DLOG << " at assignment: image height: " << image_height_; - - std::unique_ptr<half_t[]> imageData{}; - int count = 0; - if (tensor_data != nullptr) { - imageData.reset(new half_t[width * height * 4]); - float *p = tensor_data; - size_t i0 = 0; - for (int n = 0; n < N; n++) { - for (int c = 0; c < c_block_ * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (int h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (int w = 0; w < W; w++) { - if (c < C) { - // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + - // (c % 4); - imageData[i2] = Float2Half(*p); - i2 += 4; - p++; - } else { - imageData[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } - } - InitCLImage(context, width, height, imageData.get()); - } bool initialized_ = false; std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_; std::unique_ptr<_cl_event, CLEventDeleter> cl_event_; - size_t image_width_; - size_t width_of_one_block_; - size_t height_of_one_block_; - size_t image_height_; - size_t c_block_; DDim tensor_dims_; DDim image_dims_; float *tensor_data_ = nullptr; cl_context context_; cl_command_queue command_queue_; + CLImageConverterBase *image_converter_ = nullptr; }; void TensorToCLImage(Tensor *tensor, CLImage *image, diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad3aec5b86bff5ad0c63b4a274addf70ff0eb05c --- /dev/null +++ b/src/framework/cl/cl_image_converter.cpp @@ -0,0 +1,301 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/cl/cl_image_converter.h" + +namespace paddle_mobile { +namespace framework { + +const DDim &CLImageConverterDefault::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + return make_ddim({width, height}); +} + +void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = nchw; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + int width = image_dim[0]; + int height = image_dim[0]; + + float *p = tensor; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +const DDim &CLImageConverterFolder::InitImageDimInfoWith( + const DDim &tensor_dim) { + if (tensor_dim.size() <= 2) { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + int width = (tdim[1] + 3) / 4; + int height = tdim[0]; + + width_of_one_block_ = width; + height_of_one_block_ = height; + c_block_ = 1; + + return make_ddim({width, height}); + + } else { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); + } +} + +void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + 
default_converter.NCHWToImage(tensor, image, tensor_dim); + + } else { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + + DDim image_dim = InitImageDimInfoWith(tensor_dim); + int width = image_dim[0]; + + for (int h = 0; h < tdim[0]; h++) { + for (int w = 0; w < tdim[1]; w++) { + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * tdim[1] + w]); + } + } + } +} + +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); + + } else { + int width = image_dim[0]; + int height = image_dim[1]; + int H, W; + + if (tensor_dim.size() == 2) { + H = tensor_dim[0]; + W = tensor_dim[1]; + } else if (tensor_dim.size() == 1) { + H = 1; + W = tensor_dim[0]; + } + float *p = tensor; + + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); + } + } + } +} + +const DDim &CLImageConverterNWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + auto image_dim = InitImageDimInfoWith(tensor_dim); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < block * 4; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * (n / 4) * H * W + h * W * 4 + + w * 4 + (n % 4); + if (n < N) { + image[index] = Float2Half(*p); + p++; + } else { + image[index] = 0.0; + } + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * (n / 4) * H * W + h * W * 4 + + w * 4 + (n % 4); + *p = Half2Float(image[index]); + p++; + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..6b7318e1051302b4e7e428aade5848cce7d64ba1 --- /dev/null +++ b/src/framework/cl/cl_image_converter.h @@ -0,0 +1,74 
@@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/cl/cl_half.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +class CLImageConverterBase { + public: + virtual void NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) = 0; + + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, + const DDim &tensor_dim) = 0; + virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0; +}; + +class CLImageConverterDefault : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +class CLImageConverterFolder : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +} // namespace framework +} // namespace paddle_mobile
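A usage sketch may help here. The following is hedged and illustrative only (the function name and buffers are not part of the patch); it exercises the default converter the same way CLImage::InitCLImage does above. Note that InitImageDimInfoWith is declared to return a const reference to a value the implementations construct locally, so callers in this patch, like this sketch, copy the result immediately:

#include <vector>
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_image_converter.h"

namespace paddle_mobile {
namespace framework {

// Pack an NCHW float tensor into RGBA half texels, then unpack it again.
void RoundTripSketch(float *nchw, const DDim &tensor_dim) {
  CLImageConverterDefault converter;
  // Image-space dims {width, height} derived from the NCHW dims.
  DDim image_dim = converter.InitImageDimInfoWith(tensor_dim);
  // 4 half values per texel, as in CLImage::InitCLImage above.
  std::vector<half_t> image(product(image_dim) * 4);
  converter.NCHWToImage(nchw, image.data(), tensor_dim);             // pack
  converter.ImageToNCHW(image.data(), nchw, image_dim, tensor_dim);  // unpack
}

}  // namespace framework
}  // namespace paddle_mobile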
{ - return holder_->size(); - } + int memorySize() { return holder_->size(); } ~CLTensor() { DLOG << "~CLTensor"; diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index d2ba4d97362f51d7fe22be0b2a1ddd7cf7563c4b..57428407f67eb3b58372033eee8077ec6f078ee2 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -429,7 +429,6 @@ std::shared_ptr Executor::Predict( } #endif - auto last_op = ops.rbegin(); auto output_map = (*last_op)->Outputs(); std::vector out_keys = (*last_op)->GetOutKeys(); diff --git a/src/operators/kernel/arm/fetch_kernel.cpp b/src/operators/kernel/arm/fetch_kernel.cpp index ce5b34818a23550f977dc6bd5d3bbace05321fe3..62d0e678891e4f54471f95de08242a3e72f7a385 100644 --- a/src/operators/kernel/arm/fetch_kernel.cpp +++ b/src/operators/kernel/arm/fetch_kernel.cpp @@ -23,4 +23,4 @@ void FetchKernel::Compute(const FetchParam ¶m) { template class FetchKernel; } // namespace operators } // namespace paddle_mobile -#endif \ No newline at end of file +#endif diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp index 10add5de5d5b271389671d28bfd3bcd2deaa3c8a..0d5695cb80736dcc126ce5f726c0a2566884fe45 100644 --- a/src/operators/kernel/cl/batchnorm_kernel.cpp +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -76,18 +76,16 @@ void BatchNormKernel::Compute( auto out = param.OutputY()->GetCLImage(); auto new_scale = param.NewScale()->GetCLImage(); auto new_bias = param.NewBias()->GetCLImage(); - const int out_height = param.OutputY()->HeightOfOneBlock(); - const int out_width = param.OutputY()->WidthOfOneBlock(); + const int out_width = default_work_size[1]; - clSetKernelArg(kernel, 0, sizeof(int), &out_height); clSetKernelArg(kernel, 1, sizeof(int), &out_width); clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale); clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias); clSetKernelArg(kernel, 5, sizeof(cl_mem), &out); -// cl_event out_event = param.OutputY()->GetClEvent(); -// cl_event wait_event = param.InputX()->GetClEvent(); + // cl_event out_event = param.OutputY()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); } diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index f252cddf47e8ea53947b317773fbcba32fd1c5d7..29f1b151994d60f0f4c0956201bcb7747e144347 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -37,8 +37,7 @@ bool ConvAddBNReluKernel::Init( if (filter_ddim[1] == 1) { param->Filter()->Resize(ddim); } - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); + param->Bias()->InitCLImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); @@ -135,19 +134,25 @@ bool ConvAddBNReluKernel::Init( param->SetOffset(offset); - if (param->Filter()->WidthOfOneBlock() == 1 && - param->Filter()->HeightOfOneBlock() == 1) { + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); DLOG << " conv add bn relu conv 1x1"; } else if (param->Filter()->dims()[0] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { - // 
this->cl_helper_.AddKernel("depth_conv_3x3", - // "conv_add_bn_relu_kernel.cl"); - this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_add_bn_relu_kernel.cl"); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); DLOG << " conv add bn relu depth_conv_3x3"; - } else if (param->Filter()->WidthOfOneBlock() == 3 && - param->Filter()->HeightOfOneBlock() == 3) { + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); DLOG << " conv add bn relu conv_3x3"; } else { @@ -173,12 +178,14 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( auto output = param.Output()->GetCLImage(); int stride = param.Strides()[0]; int offset = param.Offset(); - int input_c = param.Input()->CBlock(); + int input_c = reinterpret_cast<framework::CLImageConverterFolder *>( + param.Input()->Converter()) + ->GetCBlock(); int dilation = param.Dilations()[0]; - int input_width = param.Input()->WidthOfOneBlock(); - int input_height = param.Input()->HeightOfOneBlock(); - int output_width = param.Output()->WidthOfOneBlock(); - int output_height = param.Output()->HeightOfOneBlock(); + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; // DLOG << " c block " << c_block; // DLOG << " w " << w; diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index cb093451a5acbe480028d8518f665127faa02f1c..d8064fd50e1e9bdf19418bb60541f541fec9fcfc 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -25,8 +25,6 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam *param) { param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); param->Bias()->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); @@ -34,14 +32,24 @@ static_cast<int>(param->Paddings()[1]); param->SetOffset(offset); - if (param->Filter()->WidthOfOneBlock() == 1 && - param->Filter()->HeightOfOneBlock() == 1) { + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); } else if (param->Filter()->dims()[1] == 1) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl"); - } else if (param->Filter()->WidthOfOneBlock() == 3 && - param->Filter()->HeightOfOneBlock() == 3) { + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl"); + } else { PADDLE_MOBILE_THROW_EXCEPTION(" not support "); } @@ -63,12 +71,15 @@ void ConvAddKernel<GPU_CL, float>::Compute( auto output = param.Output()->GetCLImage(); int stride = param.Strides()[0]; int offset = param.Offset(); - int input_c = param.Input()->CBlock(); + int input_c = reinterpret_cast<framework::CLImageConverterFolder *>( 
param.Input()->Converter()) + ->GetCBlock(); int dilation = param.Dilations()[0]; - int input_width = param.Input()->WidthOfOneBlock(); - int input_height = param.Input()->HeightOfOneBlock(); - int output_width = param.Output()->WidthOfOneBlock(); - int output_height = param.Output()->HeightOfOneBlock(); + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; cl_int status; @@ -117,12 +128,12 @@ status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); CL_CHECK_ERRORS(status); -// cl_event out_event = param.Output()->GetClEvent(); -// cl_event wait_event = param.Input()->GetClEvent(); + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index 4c9820ae71c3e8dd7a78f701b1bf348d615d4aa1..05cefadce052fb65664cc797c800ec67e43f3a2c 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -29,7 +29,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam *param) { auto filter_ddim = param->Filter()->dims(); std::vector<int64_t> filter_shape( - {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]}); + {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]}); framework::DDim ddim = framework::make_ddim(filter_shape); if (filter_ddim[1] == 1) { param->Filter()->Resize(ddim); @@ -44,12 +44,11 @@ DLOG << " init helper: " << &cl_helper_; DLOG << " conv kernel add kernel ~ "; - DLOG << " width of one block: " << param->Filter()->WidthOfOneBlock(); - DLOG << " height of one block: " << param->Filter()->HeightOfOneBlock(); + DLOG << " width of one block: " << param->Filter()->dims()[3]; + DLOG << " height of one block: " << param->Filter()->dims()[2]; DLOG << " filter dims: " << param->Filter()->dims(); - if (param->Filter()->WidthOfOneBlock() == 1 && - param->Filter()->HeightOfOneBlock() == 1) { + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { DLOG << " here1 "; this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl"); @@ -59,8 +58,8 @@ DLOG << " here2 "; this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl"); - } else if (param->Filter()->WidthOfOneBlock() == 3 && - param->Filter()->HeightOfOneBlock() == 3) { + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { DLOG << " here3 "; this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl"); @@ -84,13 +83,15 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam &param) { int stride = param.Strides()[0]; int offset = param.Offset(); - int input_c = param.Input()->CBlock(); + int input_c = reinterpret_cast<framework::CLImageConverterFolder *>( + param.Input()->Converter()) + ->GetCBlock(); int dilation = param.Dilations()[0]; - int input_width = param.Input()->WidthOfOneBlock(); - int input_height = param.Input()->HeightOfOneBlock(); - int output_width = param.Output()->WidthOfOneBlock(); - int output_height = param.Output()->HeightOfOneBlock(); + int 
input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; cl_int status; @@ -122,13 +123,12 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam &param) { status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); -// cl_event out_event = param.Output()->GetClEvent(); -// cl_event wait_event = param.Input()->GetClEvent(); - - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp index d4a539ab0a75c7ec04141de4a8619613712a1e52..35813a31f570c8daf956e4c90d0f3e3de1675eb4 100644 --- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -50,12 +50,15 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute( auto output = param.Output()->GetCLImage(); int stride = param.Strides()[0]; int offset = param.Offset(); - int input_c = param.Input()->CBlock(); + int input_c = reinterpret_cast<framework::CLImageConverterFolder *>( + param.Input()->Converter()) + ->GetCBlock(); int dilation = param.Dilations()[0]; - int input_width = param.Input()->WidthOfOneBlock(); - int input_height = param.Input()->HeightOfOneBlock(); - int output_width = param.Output()->WidthOfOneBlock(); - int output_height = param.Output()->HeightOfOneBlock(); + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; cl_int status; @@ -76,12 +79,12 @@ CL_CHECK_ERRORS(status); -// cl_event out_event = param.Output()->GetClEvent(); -// cl_event wait_event = param.Input()->GetClEvent(); + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 886b535bbc655139d3d6fcc87fa2900c435ad744..ad5fb9cdbcd00dad56579297c010c3912e3dca24 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -30,7 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam &param) { cl_int status; auto output = param.Out(); const Tensor *input = param.InputX(); -// DLOG << *input; + // DLOG << *input; const float *input_data = input->data<float>(); int numel = input->numel(); cl_mem cl_image = output->GetCLImage(); @@ -52,7 +52,7 @@ size_t global_work_size[2] = {width, height}; -// cl_event out_event = param.Out()->GetClEvent(); + // cl_event out_event = param.Out()->GetClEvent(); status = 
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index 06646debd43b6cc4ecf9b33f9c96e70f2289258a..31c1d4179cbdfc8145d90bee2353be821e65b40b 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -14,8 +14,8 @@ limitations under the License. */ #include "operators/kernel/fetch_kernel.h" #include "framework/cl/cl_tensor.h" -//#include "common/common.h" -//#include <iostream> +// #include "common/common.h" +// #include <iostream> namespace paddle_mobile { namespace operators { @@ -75,22 +75,22 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam &param) { clSetKernelArg(kernel, 6, sizeof(int), &size_batch); } -// cl_event wait_event = param.InpdutX()->GetClEvent(); + // cl_event wait_event = param.InpdutX()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); -// auto time1 = paddle_mobile::time(); + // auto time1 = paddle_mobile::time(); -// printf(" before finish \n"); -// clFlsh(this->cl_helper_.CLCommandQueue()); + // printf(" before finish \n"); + // clFlsh(this->cl_helper_.CLCommandQueue()); clFinish(this->cl_helper_.CLCommandQueue()); -// printf(" after finish \n"); + // printf(" after finish \n"); -// auto time2 = paddle_mobile::time(); -// -// -// std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2) -// << "ms" << std::endl; + // auto time2 = paddle_mobile::time(); + // + // + // std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2) + // << "ms" << std::endl; memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size()); } diff --git a/src/operators/kernel/cl/pool_kernel.cpp b/src/operators/kernel/cl/pool_kernel.cpp index 3159152944fe6259e47528f30d86b4b1a1f373b0..df79ababadd4c1b959a1eb0fe237a45ab97a6bd8 100644 --- a/src/operators/kernel/cl/pool_kernel.cpp +++ b/src/operators/kernel/cl/pool_kernel.cpp @@ -34,10 +34,17 @@ void PoolKernel<GPU_CL, float>::Compute(const PoolParam &param) { auto input = param.Input()->GetCLImage(); auto out = param.Output()->GetCLImage(); - const int in_height = param.Input()->HeightOfOneBlock(); - const int in_width = param.Input()->WidthOfOneBlock(); - const int out_height = param.Output()->HeightOfOneBlock(); - const int out_width = param.Output()->WidthOfOneBlock(); + framework::CLImageConverterFolder *input_folder_converter = + reinterpret_cast<framework::CLImageConverterFolder *>( + param.Input()->Converter()); + framework::CLImageConverterFolder *output_folder_converter = + reinterpret_cast<framework::CLImageConverterFolder *>( + param.Output()->Converter()); + + const int in_height = input_folder_converter->HeightOfOneBlock(); + const int in_width = input_folder_converter->WidthOfOneBlock(); + const int out_height = output_folder_converter->HeightOfOneBlock(); + const int out_width = output_folder_converter->WidthOfOneBlock(); std::string pooling_type = param.PoolingType(); std::vector<int> ksize = param.Ksize(); @@ -63,8 +70,8 @@ clSetKernelArg(kernel, 10, sizeof(cl_mem), &input); clSetKernelArg(kernel, 11, sizeof(cl_mem), &out); -// cl_event out_event = param.Output()->GetClEvent(); -// cl_event wait_event = param.Input()->GetClEvent(); + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); }
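The conv, depthwise-conv, and pool kernels above all reach block information through the same downcast of CLImage::Converter(). A hypothetical helper, not part of this patch, that names the pattern once:

#include "framework/cl/cl_image.h"
#include "framework/cl/cl_image_converter.h"

namespace paddle_mobile {
namespace operators {

// Hypothetical convenience wrapper: the kernels in this patch inline
// this cast at every call site.
inline framework::CLImageConverterFolder *FolderConverterOf(
    const framework::CLImage &image) {
  return reinterpret_cast<framework::CLImageConverterFolder *>(
      image.Converter());
}

// Usage, e.g. in a Compute() body:
//   int input_c = FolderConverterOf(*param.Input())->GetCBlock();

}  // namespace operators
}  // namespace paddle_mobile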
diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp index 70c939b86df1fcbd161249638771256f68a34024..c3acfe442201a9be59c6f0a0a536cf9aea68c4a2 100644 --- a/src/operators/kernel/cl/relu_kernel.cpp +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -21,41 +21,41 @@ namespace operators { template <> bool ReluKernel<GPU_CL, float>::Init(ReluParam* param) { this->cl_helper_.AddKernel("relu", "relu.cl"); -// this->cl_helper_.AddKernel("relu_p0", "relu.cl"); -// this->cl_helper_.AddKernel("relu_p1", "relu.cl"); -// const auto dim = -// const_cast<framework::CLImage *>(param->InputX())->ImageDims(); -// param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), -// this->cl_helper_.CLCommandQueue(), dim); + // this->cl_helper_.AddKernel("relu_p0", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p1", "relu.cl"); + // const auto dim = + // const_cast<framework::CLImage *>(param->InputX())->ImageDims(); + // param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), + // this->cl_helper_.CLCommandQueue(), + // dim); return true; } template <> void ReluKernel<GPU_CL, float>::Compute(const ReluParam& param) { auto kernel = this->cl_helper_.KernelAt(0); -// auto kernel_p0 = this->cl_helper_.KernelAt(1); -// auto kernel_p1 = this->cl_helper_.KernelAt(2); + // auto kernel_p0 = this->cl_helper_.KernelAt(1); + // auto kernel_p1 = this->cl_helper_.KernelAt(2); const auto* input = param.InputX(); auto* output = param.Out(); auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); -// auto tImage = -// const_cast<ReluParam &>(param).getMidImage().GetCLImage(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); -// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); -// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); -// clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); -// clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; + // auto tImage = + // const_cast<ReluParam &>(param).getMidImage().GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); + const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; -// cl_event out_event = param.Out()->GetClEvent(); -// cl_event wait_event = param.InputX()->GetClEvent(); + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, - work_size, NULL, 0, NULL, NULL); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3, // NULL, // work_size, NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp index a1e6b57984b7e8a206d2f452edaf7100260e274c..fb3aa9b52f722b21cdc30e54eafadf9dffcfef7a 100644 --- a/src/operators/kernel/cl/reshape_kernel.cpp +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -55,8 +55,8 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam &param) { clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); 
const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; -// cl_event out_event = param.Out()->GetClEvent(); -// cl_event wait_event = param.InputX()->GetClEvent(); + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index 432ead67fc87523a3d33ed83eff6ffe8c4666f97..22e6672ee462b963476dc72895329a9117fc16a8 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -42,27 +42,27 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam &param) { status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); status = clSetKernelArg(kernel, 2, sizeof(int), &group); -// const auto &inputDim = input->dims(); -// -// int dims[4] = {1, 1, 1, 1}; -// -// for (int i = 0; i < inputDim.size(); i++) { -// dims[4 - inputDim.size() + i] = inputDim[i]; -// } -// -// clSetKernelArg(kernel, 2, sizeof(int), &dims); -// clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); -// clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); -// clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); -// cl_event out_event = param.Out()->GetClEvent(); -// cl_event wait_event = param.InputX()->GetClEvent(); - - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + // const auto &inputDim = input->dims(); + // + // int dims[4] = {1, 1, 1, 1}; + // + // for (int i = 0; i < inputDim.size(); i++) { + // dims[4 - inputDim.size() + i] = inputDim[i]; + // } + // + // clSetKernelArg(kernel, 2, sizeof(int), &dims); + // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); + // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); + // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - } template class SoftmaxKernel<GPU_CL, float>; diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index 402b187f8f5e9d2fbb70fa6bcfb72c88aa53e3d3..4748227f1e872edb8adb1a49db1abda8094df6f7 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { #if __ARM_NEON - //#ifdef _OPENMP + // #ifdef _OPENMP // const float *newscale_data = new_scale->data<float>(); // const float *newbias_data = new_bias->data<float>(); // @@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, // } // } // - //#else + // #else const float *input_data = input->data<float>(); const float *filter_data = filter->data<float>(); @@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, input_data += inhxw * c; output_data += outhxw * c; } -//#endif +// #endif #endif } diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp index a6ad5a5e9622cd67ac542f8d1a7c56419145d07a..fa8564be1515d0498ea4040da7e9712debe20cba 100644 --- 
a/test/net/test_mobilenet_GPU.cpp +++ b/test/net/test_mobilenet_GPU.cpp @@ -33,23 +33,27 @@ int main() { std::vector<int64_t> dims{1, 3, 224, 224}; GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); + std::vector<float> vec_result; + // = paddle_mobile.Predict(input, dims); + auto time3 = paddle_mobile::time(); - auto vec_result = paddle_mobile.Predict(input, dims); + int max = 1; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } auto time4 = paddle_mobile::time(); -// for (int i = 0; i < 10; ++i) { -// auto vec_result = paddle_mobile.Predict(input, dims); -// } - + // auto time3 = paddle_mobile::time(); -// auto time3 = paddle_mobile::time(); + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } -// for (int i = 0; i < 10; ++i) { -// auto vec_result = paddle_mobile.Predict(input, dims); -// } + // auto time4 = paddle_mobile::time(); -// auto time4 = paddle_mobile::time(); - std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) << "ms" << std::endl; + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; std::vector<float>::iterator biggest = std::max_element(std::begin(vec_result), std::end(vec_result)); std::cout << " Max element is " << *biggest << " at position "