提交 4001ca6f 编写于 作者: L liuruilong

update opencl code

上级 83737c34
cmake_minimum_required(VERSION 3.6)
option(USE_OPENMP "openmp support" OFF)
project(paddle-mobile)
option(USE_OPENMP "openmp support" OFF)
option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" OFF)
......@@ -12,6 +10,8 @@ option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" ON)
option(FPGA "fpga" OFF)
project(paddle-mobile)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
......
......@@ -90,7 +90,8 @@ class CLEngine {
bool BuildProgram(cl_program program) {
cl_int status;
status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel", 0, 0);
status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel",
0, 0);
CL_CHECK_ERRORS(status);
......@@ -98,7 +99,7 @@ class CLEngine {
size_t log_size;
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *)malloc(log_size);
char *log = reinterpret_cast<char *>(malloc(log_size));
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
DLOG << " program build error: " << log;
......
......@@ -49,9 +49,6 @@ class CLHelper {
cl_context CLContext() { return scope_->Context(); }
std::vector<size_t> DefaultWorkSize(const CLImage &image) {
if (image.GetImageType() == Invalid) {
PADDLE_MOBILE_THROW_EXCEPTION(" not support image type");
}
// n c h w
auto image_dim = image.dims();
if (image_dim.size() == 4) {
......@@ -66,7 +63,7 @@ class CLHelper {
} else if (image_dim.size() == 2) {
return {1, image.ImageWidth(), image.ImageHeight()};
} else if (image_dim.size() == 1) {
return {1, image.ImageWidth(), 1};
return {1, image.ImageWidth(), 1};
}
PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
}
......
......@@ -16,214 +16,47 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
cl_command_queue commandQueue) {
DDim ddim = cl_image->dims();
size_t N, C, H, W;
if (ddim.size() == 4) {
N = ddim[0];
if (N < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
} else if (ddim.size() == 1) {
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = tensor->mutable_data<float>();
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(imageData[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err);
}
// TODO(yangfei): need imp
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
cl_command_queue commandQueue) {
DDim ddim = cl_image->dims();
size_t N, C, H, W;
if (ddim.size() == 4) {
N = ddim[0];
if (N < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
} else if (ddim.size() == 1) {
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
const float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
cl_int err;
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err);
}
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
imageData[i2] = Float2Half(*p);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
// TODO(yangfei): need imp
}
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image) {
if (cl_image.GetImageType() == Invalid) {
PADDLE_MOBILE_THROW_EXCEPTION(" not support image type");
}
printer << " dims: " << cl_image.dims() << "\n";
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
float *data = new float[cl_image.numel()];
DDim ddim = cl_image.dims();
size_t N, C, H, W, width, height;
if (cl_image.GetImageType() == Normal || cl_image.dims().size() == 3 ||
cl_image.dims().size() == 4) {
if (ddim.size() == 4) {
N = ddim[0];
if (N < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
width = W * ((C + 3) / 4);
height = N * H;
} else if (ddim.size() == 2) {
width = ddim[1];
height = ddim[0];
N = 1;
C = 1;
H = ddim[0];
W = ddim[1];
} else if (ddim.size() == 1) {
width = ddim[0];
height = 1;
N = 1;
C = 1;
H = 1;
W = ddim[0];
}
float *p = data;
half *imageData = new half[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) {
printf("ImageWidth %ld \n", cl_image.ImageWidth());
printf("ImageWidth %ld \n", cl_image.ImageHeight());
}
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(imageData[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
delete (imageData);
CL_CHECK_ERRORS(err);
int width = cl_image.ImageDims()[0];
int height = cl_image.ImageDims()[1];
} else {
if (ddim.size() == 2) {
width = (ddim[1] + 3) / 4;
height = ddim[0];
H = ddim[0];
W = ddim[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
} else if (ddim.size() == 1) {
width = (ddim[0] + 3) / 4;
height = 1;
H = 1;
W = ddim[0];
}
float *p = data;
half *imageData = new half[width * height * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, imageData, 0, NULL, NULL);
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
p[h * W + w] = Half2Float(imageData[(h * width + w / 4) * 4 + (w % 4)]);
}
}
CL_CHECK_ERRORS(err);
delete (imageData);
CL_CHECK_ERRORS(err);
}
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
printer << " dims: " << cl_image.dims() << "\n";
for (int i = 0; i < cl_image.numel(); i += stride) {
printer << data[i] << " ";
printer << tensor_data[i] << " ";
}
delete (data);
delete[](tensor_data);
delete[](image_data);
return printer;
}
#endif
......
......@@ -18,22 +18,30 @@ limitations under the License. */
#include "CL/cl.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_tool.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tool.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
enum ImageType { Invalid = -1, Normal = 0, Folder = 1 };
class CLImage {
public:
CLImage() = default;
~CLImage() {
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
}
if (image_converter_) {
delete (image_converter_);
}
}
/*
* will not hold input tensor data, memcpy in this method
* */
......@@ -54,79 +62,79 @@ class CLImage {
* folder when one dim or two dim
* */
void InitCLImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
DLOG << tensor_dims_;
if (tensor_dims_.size() <= 2) {
DLOG << " dim <= 2 folder ~~~~~ ";
InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
} else {
DLOG << " dim > 2 norm ~~~~~ ";
InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
InitCLImage(context, command_queue, folder_converter);
}
void InitCLImage(cl_context context, cl_command_queue command_queue,
CLImageConverterBase *converter) {
if (image_converter_ != nullptr) {
delete (image_converter_);
}
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
DLOG << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
half_t *image_data = new half_t[product(image_dims_) * 4];
DLOG << " convert to image";
converter->NCHWToImage(tensor_data_, image_data, tensor_dims_);
DLOG << " end convert to image";
InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
delete[](image_data);
delete[](tensor_data_);
command_queue_ = command_queue;
tensor_data_ = nullptr;
image_converter_ = converter;
initialized_ = true;
DLOG << " end init cl image";
}
/*
* need call SetTensorData first
* */
void InitCLImageNormal(cl_context context, cl_command_queue command_queue) {
void InitNImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
delete[](tensor_data_);
tensor_data_ = nullptr;
initialized_ = true;
CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock();
InitCLImage(context, command_queue, folder_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
if (tensor_data_ != nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(
" empty image tensor data shouldn't have value");
}
DLOG << " init empty image ";
if (tensor_dims_.size() <= 2) {
DLOG << " dim <= 2 folder ~~~~~ ";
InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
} else {
DLOG << " dim > 2 norm ~~~~~ ";
InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
}
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
cl_event_ = CLEngine::Instance()->CreateEvent(context);
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
DLOG << " to get image dims ";
image_dims_ = folder_converter->InitImageDimInfoWith(dim);
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
// InitCLImage(context, command_queue, nullptr, dim);
tensor_dims_ = dim;
command_queue_ = command_queue;
image_converter_ = folder_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
cl_mem GetCLImage() const { return cl_image_.get(); }
const DDim &ImageDims() const { return image_dims_; }
inline size_t ImageWidth() const { return image_width_; }
inline size_t ImageHeight() const { return image_height_; }
inline size_t ImageWidth() const { return image_dims_[0]; }
/*
* block of channels, 4 channel one block
* */
inline size_t CBlock() const { return c_block_; }
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
inline size_t ImageHeight() const { return image_dims_[1]; }
inline cl_command_queue CommandQueue() const { return command_queue_; }
......@@ -158,47 +166,11 @@ class CLImage {
* */
const DDim &dims() const { return tensor_dims_; }
const ImageType GetImageType() const { return image_type_; }
cl_event GetClEvent() const { return cl_event_.get(); }
private:
ImageType image_type_ = Invalid;
void InitCLImage2C(cl_context context, cl_command_queue command_queue,
float *tensor_data, const DDim &dim) {
image_type_ = Folder;
command_queue_ = command_queue;
assert(dim.size() <= 2);
int tdim[2] = {1, 1};
if (dim.size() == 1) {
tdim[1] = dim[0];
} else {
tdim[0] = dim[0];
tdim[1] = dim[1];
}
int width = (tdim[1] + 3) / 4;
int height = tdim[0];
image_width_ = width;
image_height_ = height;
image_dims_ = make_ddim({width, height});
width_of_one_block_ = width;
height_of_one_block_ = height;
c_block_ = 1;
std::unique_ptr<half_t[]> imageData{};
if (tensor_data) {
imageData.reset(new half_t[width * height * 4]);
for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) {
imageData[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor_data[h * tdim[1] + w]);
}
}
}
InitCLImage(context, width, height, imageData.get());
}
CLImageConverterBase *Converter() const { return image_converter_; }
private:
void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
......@@ -228,89 +200,16 @@ class CLImage {
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
}
void InitCLImage(cl_context context, cl_command_queue command_queue,
float *tensor_data, const DDim &dim) {
image_type_ = Normal;
DLOG << " tensor dim: " << dim;
// NCHW -> [W * (C+3)/4, H * N]
tensor_dims_ = dim;
command_queue_ = command_queue;
if (tensor_data) {
tensor_data_ = tensor_data;
}
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < dim.size(); ++j) {
new_dims[4 - dim.size() + j] = dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
width_of_one_block_ = W;
height_of_one_block_ = H;
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
image_width_ = width;
image_height_ = height;
image_dims_ = make_ddim({image_width_, image_height_});
c_block_ = width / W;
DLOG << " tensor dim " << tensor_dims_;
DLOG << " 赋值时: image width: " << image_width_;
DLOG << " 赋值时: image height: " << image_height_;
std::unique_ptr<half_t[]> imageData{};
int count = 0;
if (tensor_data != nullptr) {
imageData.reset(new half_t[width * height * 4]);
float *p = tensor_data;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < c_block_ * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
imageData[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
imageData[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
InitCLImage(context, width, height, imageData.get());
}
bool initialized_ = false;
std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
size_t image_width_;
size_t width_of_one_block_;
size_t height_of_one_block_;
size_t image_height_;
size_t c_block_;
DDim tensor_dims_;
DDim image_dims_;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
CLImageConverterBase *image_converter_ = nullptr;
};
void TensorToCLImage(Tensor *tensor, CLImage *image,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image_converter.h"
namespace paddle_mobile {
namespace framework {
const DDim &CLImageConverterDefault::InitImageDimInfoWith(
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
return make_ddim({width, height});
}
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = nchw;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
int width = image_dim[0];
int height = image_dim[0];
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
const DDim &CLImageConverterFolder::InitImageDimInfoWith(
const DDim &tensor_dim) {
if (tensor_dim.size() <= 2) {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
int width = (tdim[1] + 3) / 4;
int height = tdim[0];
width_of_one_block_ = width;
height_of_one_block_ = height;
c_block_ = 1;
return make_ddim({width, height});
} else {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return make_ddim({width, height});
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
"tensor dim is not support ");
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
} else {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
DDim image_dim = InitImageDimInfoWith(tensor_dim);
int width = image_dim[0];
for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
} else {
int width = image_dim[0];
int height = image_dim[1];
int H, W;
if (tensor_dim.size() == 2) {
H = tensor_dim[0];
W = tensor_dim[1];
} else if (tensor_dim.size() == 1) {
H = 1;
W = tensor_dim[0];
}
float *p = tensor;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
}
}
}
}
const DDim &CLImageConverterNWBlock::InitImageDimInfoWith(
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
auto image_dim = InitImageDimInfoWith(tensor_dim);
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < block * 4; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * (n / 4) * H * W + h * W * 4 +
w * 4 + (n % 4);
if (n < N) {
image[index] = Float2Half(*p);
p++;
} else {
image[index] = 0.0;
}
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * (n / 4) * H * W + h * W * 4 +
w * 4 + (n % 4);
*p = Half2Float(image[index]);
p++;
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/cl/cl_half.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace framework {
class CLImageConverterBase {
public:
virtual void NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
class CLImageConverterDefault : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace framework
} // namespace paddle_mobile
......@@ -38,12 +38,14 @@ class CLScope {
std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
const std::string &kernel_name, const std::string &file_name) {
DLOG << " to get program " << file_name;
auto program = Program(file_name);
DLOG << " get program ~ ";
DLOG << " end get program ~ ";
DLOG << " to create kernel: " << kernel_name;
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
clCreateKernel(program, kernel_name.c_str(), &status_));
CL_CHECK_ERRORS(status_);
DLOG << " create kernel ~ ";
DLOG << " end create kernel ~ ";
return std::move(kernel);
}
......
......@@ -115,9 +115,7 @@ class CLTensor : TensorBase {
return reinterpret_cast<T *>(host_ptr_);
}
int memorySize() {
return holder_->size();
}
int memorySize() { return holder_->size(); }
~CLTensor() {
DLOG << "~CLTensor";
......
......@@ -429,7 +429,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
}
#endif
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
......
......@@ -23,4 +23,4 @@ void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
template class FetchKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
\ No newline at end of file
#endif
......@@ -76,18 +76,16 @@ void BatchNormKernel<GPU_CL, float>::Compute(
auto out = param.OutputY()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
const int out_height = param.OutputY()->HeightOfOneBlock();
const int out_width = param.OutputY()->WidthOfOneBlock();
const int out_width = default_work_size[1];
clSetKernelArg(kernel, 0, sizeof(int), &out_height);
clSetKernelArg(kernel, 1, sizeof(int), &out_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);
// cl_event out_event = param.OutputY()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
// cl_event out_event = param.OutputY()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
......
......@@ -37,8 +37,7 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
if (filter_ddim[1] == 1) {
param->Filter()->Resize(ddim);
}
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
......@@ -135,19 +134,25 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu conv 1x1";
} else if (param->Filter()->dims()[0] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
// this->cl_helper_.AddKernel("depth_conv_3x3",
// "conv_add_bn_relu_kernel.cl");
this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_add_bn_relu_kernel.cl");
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu depth_conv_3x3";
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu conv_3x3";
} else {
......@@ -173,12 +178,14 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
......
......@@ -25,8 +25,6 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
......@@ -34,14 +32,24 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
......@@ -63,12 +71,15 @@ void ConvAddKernel<GPU_CL, float>::Compute(
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
......@@ -117,12 +128,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
......
......@@ -29,7 +29,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
auto filter_ddim = param->Filter()->dims();
std::vector<int64_t> filter_shape(
{filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]});
{filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]});
framework::DDim ddim = framework::make_ddim(filter_shape);
if (filter_ddim[1] == 1) {
param->Filter()->Resize(ddim);
......@@ -44,12 +44,11 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
DLOG << " init helper: " << &cl_helper_;
DLOG << " conv kernel add kernel ~ ";
DLOG << " width of one block: " << param->Filter()->WidthOfOneBlock();
DLOG << " height of one block: " << param->Filter()->HeightOfOneBlock();
DLOG << " width of one block: " << param->Filter()->dims()[3];
DLOG << " height of one block: " << param->Filter()->dims()[2];
DLOG << " filter dims: " << param->Filter()->dims();
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
DLOG << " here1 ";
this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
......@@ -59,8 +58,8 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
DLOG << " here2 ";
this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
DLOG << " here3 ";
this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
......@@ -84,13 +83,15 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
......@@ -122,13 +123,12 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
......
......@@ -50,12 +50,15 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
......@@ -76,12 +79,12 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
......
......@@ -30,7 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
cl_int status;
auto output = param.Out();
const Tensor *input = param.InputX();
// DLOG << *input;
// DLOG << *input;
const float *input_data = input->data<float>();
int numel = input->numel();
cl_mem cl_image = output->GetCLImage();
......@@ -52,7 +52,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
size_t global_work_size[2] = {width, height};
// cl_event out_event = param.Out()->GetClEvent();
// cl_event out_event = param.Out()->GetClEvent();
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
#include "framework/cl/cl_tensor.h"
//#include "common/common.h"
//#include <iostream>
// #include "common/common.h"
// #include <iostream>
namespace paddle_mobile {
namespace operators {
......@@ -75,22 +75,22 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
}
// cl_event wait_event = param.InpdutX()->GetClEvent();
// cl_event wait_event = param.InpdutX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// auto time1 = paddle_mobile::time();
// auto time1 = paddle_mobile::time();
// printf(" before finish \n");
// clFlsh(this->cl_helper_.CLCommandQueue());
// printf(" before finish \n");
// clFlsh(this->cl_helper_.CLCommandQueue());
clFinish(this->cl_helper_.CLCommandQueue());
// printf(" after finish \n");
// printf(" after finish \n");
// auto time2 = paddle_mobile::time();
//
//
// std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
// << "ms" << std::endl;
// auto time2 = paddle_mobile::time();
//
//
// std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
// << "ms" << std::endl;
memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
}
......
......@@ -34,10 +34,17 @@ void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
auto input = param.Input()->GetCLImage();
auto out = param.Output()->GetCLImage();
const int in_height = param.Input()->HeightOfOneBlock();
const int in_width = param.Input()->WidthOfOneBlock();
const int out_height = param.Output()->HeightOfOneBlock();
const int out_width = param.Output()->WidthOfOneBlock();
framework::CLImageConverterFolder *input_folder_converter =
reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter());
framework::CLImageConverterFolder *output_folder_converter =
reinterpret_cast<framework::CLImageConverterFolder *>(
param.Output()->Converter());
const int in_height = input_folder_converter->HeightOfOneBlock();
const int in_width = input_folder_converter->WidthOfOneBlock();
const int out_height = output_folder_converter->HeightOfOneBlock();
const int out_width = output_folder_converter->WidthOfOneBlock();
std::string pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
......@@ -63,8 +70,8 @@ void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
clSetKernelArg(kernel, 10, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 11, sizeof(cl_mem), &out);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
......
......@@ -21,41 +21,41 @@ namespace operators {
template <>
bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
this->cl_helper_.AddKernel("relu", "relu.cl");
// this->cl_helper_.AddKernel("relu_p0", "relu.cl");
// this->cl_helper_.AddKernel("relu_p1", "relu.cl");
// const auto dim =
// const_cast<framework::CLImage*>(param->InputX())->ImageDims();
// param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(),
// this->cl_helper_.CLCommandQueue(), dim);
// this->cl_helper_.AddKernel("relu_p0", "relu.cl");
// this->cl_helper_.AddKernel("relu_p1", "relu.cl");
// const auto dim =
// const_cast<framework::CLImage*>(param->InputX())->ImageDims();
// param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(),
// this->cl_helper_.CLCommandQueue(),
// dim);
return true;
}
template <>
void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
// auto kernel_p0 = this->cl_helper_.KernelAt(1);
// auto kernel_p1 = this->cl_helper_.KernelAt(2);
// auto kernel_p0 = this->cl_helper_.KernelAt(1);
// auto kernel_p1 = this->cl_helper_.KernelAt(2);
const auto* input = param.InputX();
auto* output = param.Out();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
// auto tImage =
// const_cast<ReluParam<GPU_CL>&>(param).getMidImage().GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage);
// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage);
// clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage);
// clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
// auto tImage =
// const_cast<ReluParam<GPU_CL>&>(param).getMidImage().GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage);
// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage);
// clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage);
// clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL,
work_size, NULL, 0, NULL, NULL);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3,
// NULL,
// work_size, NULL, 0, NULL, NULL);
......
......@@ -55,8 +55,8 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]);
const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
......
......@@ -42,27 +42,27 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
status = clSetKernelArg(kernel, 2, sizeof(int), &group);
// const auto &inputDim = input->dims();
//
// int dims[4] = {1, 1, 1, 1};
//
// for (int i = 0; i < inputDim.size(); i++) {
// dims[4 - inputDim.size() + i] = inputDim[i];
// }
//
// clSetKernelArg(kernel, 2, sizeof(int), &dims);
// clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
// clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
// clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// const auto &inputDim = input->dims();
//
// int dims[4] = {1, 1, 1, 1};
//
// for (int i = 0; i < inputDim.size(); i++) {
// dims[4 - inputDim.size() + i] = inputDim[i];
// }
//
// clSetKernelArg(kernel, 2, sizeof(int), &dims);
// clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
// clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
// clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class SoftmaxKernel<GPU_CL, float>;
......
......@@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#if __ARM_NEON
//#ifdef _OPENMP
// #ifdef _OPENMP
// const float *newscale_data = new_scale->data<float>();
// const float *newbias_data = new_bias->data<float>();
//
......@@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
// }
// }
//
//#else
// #else
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
......@@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
input_data += inhxw * c;
output_data += outhxw * c;
}
//#endif
// #endif
#endif
}
......
......@@ -33,23 +33,27 @@ int main() {
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
std::vector<float> vec_result;
// = paddle_mobile.Predict(input, dims);
auto time3 = paddle_mobile::time();
auto vec_result = paddle_mobile.Predict(input, dims);
int max = 1;
for (int i = 0; i < max; ++i) {
vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time3 = paddle_mobile::time();
// auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time4 = paddle_mobile::time();
// auto time4 = paddle_mobile::time();
std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) << "ms" << std::endl;
std::cout << "predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl;
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册