未验证 提交 69695ecf 编写于 作者: R Ray Liu 提交者: GitHub

Merge pull request #1070 from codeWorm2015/opencl

 commit cl code
......@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <chrono>
namespace paddle_mobile {
using Time = decltype(std::chrono::high_resolution_clock::now());
inline Time time() { return std::chrono::high_resolution_clock::now(); }
......@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
}
......@@ -18,8 +18,8 @@ limitations under the License. */
#include <string>
#include "CL/cl.h"
#include "common/log.h"
#include "common/enforce.h"
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_tool.h"
......
......@@ -14,110 +14,107 @@ limitations under the License. */
#include "cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){
DDim ddim = cl_image->dims();
size_t N,C,H,W;
if(ddim.size()==4){
N = ddim[0];
if(N<0){
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
}else if(ddim.size()==1){
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0,0,0};
size_t region[3] = {width,height,1};
err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1<<2) + c % 4;
for (int w = 0; w < W; w++) {
*p = half2float(imageData[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
if (err != CL_SUCCESS) {
// TODO: error handling
}
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
cl_command_queue commandQueue) {
DDim ddim = cl_image->dims();
size_t N, C, H, W;
if (ddim.size() == 4) {
N = ddim[0];
if (N < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
} else if (ddim.size() == 1) {
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = half2float(imageData[i2]);
i2 += 4;
p++;
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,cl_command_queue commandQueue){
DDim ddim = cl_image->dims();
size_t N,C,H,W;
if(ddim.size()==4){
N = ddim[0];
if(N<0){
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
}else if(ddim.size()==1){
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
const float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0,0,0};
size_t region[3] = {width,height,1};
cl_int err;
err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
if (err != CL_SUCCESS) {
// TODO: error handling
}
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1<<2) + c % 4;
for (int w = 0; w < W; w++) {
imageData[i2] = float2half(*p);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
i1 += width;
}
}
i0 += width * H;
}
if (err != CL_SUCCESS) {
// TODO: error handling
}
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
cl_command_queue commandQueue) {
DDim ddim = cl_image->dims();
size_t N, C, H, W;
if (ddim.size() == 4) {
N = ddim[0];
if (N < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
} else if (ddim.size() == 1) {
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
const float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
cl_int err;
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) {
// TODO: error handling
}
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
imageData[i2] = float2half(*p);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
} // namespace framework
} // namespace paddle_mobile
......@@ -28,8 +28,93 @@ class CLImage {
public:
CLImage() = default;
void Init(cl_context context, float *tensorInput, DDim ddim) {
tensor_dims_ = ddim;
/*
* will not hold input tensor data, memcpy in this method
* */
void SetTensorData(float *tensorData, const DDim &dim) {
int numel = product(dim);
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
}
tensor_data_ = new float[numel];
memcpy(tensor_data_, tensorData, numel);
tensor_dims_ = dim;
}
/*
* need call SetTensorData first
* */
void InitCLImage(cl_context context) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
InitCLImage(context, tensor_data_, tensor_dims_);
delete[](tensor_data_);
tensor_data_ = nullptr;
initialized_ = true;
}
void InitEmptyImage(cl_context context, const DDim &dim) {
if (tensor_data_ != nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(
" empty image tensor data shouldn't have value");
}
InitCLImage(context, nullptr, dim);
initialized_ = true;
}
cl_mem GetCLImage() const { return cl_image_; }
const DDim &ImageDims() { return image_dims_; }
inline size_t ImageWidth() const { return image_width_; }
inline size_t ImageHeight() const { return image_height_; }
/*
* block of channels, 4 channel one block
* */
inline size_t CBlock() const { return c_block_; }
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
/*
* resize original tensor dim
* */
inline CLImage &Resize(const DDim &dims) {
tensor_dims_ = dims;
return *this;
}
template <typename T>
T *data() const {
if (initialized_) {
PADDLE_MOBILE_THROW_EXCEPTION(
" cl image has initialized, tensor data has been deleted ");
}
return reinterpret_cast<T *>(tensor_data_);
}
/*
* numel of tensor dim
* */
inline int64_t numel() const { return product(tensor_dims_); }
/*
* original tensor dim
* */
const DDim &dims() const { return tensor_dims_; }
private:
void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
// NCHW -> [W * (C+3)/4, H * N]
......@@ -62,12 +147,13 @@ class CLImage {
image_width_ = width;
image_height_ = height;
image_dims_ = make_ddim({image_width_, image_height_});
std::unique_ptr<half_t[]> imageData{};
int count = 0;
if (tensorInput != nullptr) {
if (tensor_data != nullptr) {
imageData.reset(new half_t[width * height * 4]);
float *p = tensorInput;
float *p = tensor_data;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
......@@ -108,39 +194,8 @@ class CLImage {
// TODO(HaiPeng): error handling
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
initialized_ = true;
}
void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); }
inline CLImage &Resize(const DDim &dims) {
tensor_dims_ = dims;
return *this;
}
const DDim &dims() const { return tensor_dims_; }
cl_mem GetCLImage() const { return cl_image_; }
template <typename T>
T *data() const {
return reinterpret_cast<T *>(tensor_input_);
}
inline int64_t numel() const { return product(tensor_dims_); }
inline size_t ImageWidth() const { return image_width_; }
inline size_t ImageHeight() const { return image_height_; }
inline size_t CBlock() const { return c_block_; }
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
private:
bool initialized_ = false;
cl_mem cl_image_;
size_t image_width_;
......@@ -149,7 +204,8 @@ class CLImage {
size_t image_height_;
size_t c_block_;
DDim tensor_dims_;
float *tensor_input_;
DDim image_dims_;
float *tensor_data_;
cl_context context_;
};
......
......@@ -56,7 +56,8 @@ class CLScope {
auto program = CLEngine::Instance()->CreateProgramWith(
context_.get(), "./cl_kernel/" + file_name);
status_ = clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
status_ =
clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
CL_CHECK_ERRORS(status_);
programs_[file_name] = std::move(program);
......
......@@ -21,12 +21,13 @@ namespace framework {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"OpenCL error with code %s happened in file %s at line %d. " \
"Exiting.\n", \
opencl_error_to_str(ERR), __FILE__, __LINE__); \
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"OpenCL error with code %s happened in file %s at line %d. " \
"Exiting.\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
}
} // namespace framework
......
......@@ -928,7 +928,8 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->Init(context, tensorInput, ddim);
// has not init
cl_image->SetTensorData(tensorInput, ddim);
delete origin_data;
paddle_mobile::memory::Free(tensorInput);
......@@ -941,7 +942,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->Init(context, ddim);
cl_image->InitEmptyImage(context, ddim);
}
}
}
......@@ -982,7 +983,10 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &origin_data);
cl_image->Init(context, tensorInput, ddim);
// has not init
cl_image->SetTensorData(tensorInput, ddim);
paddle_mobile::memory::Free(tensorInput);
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
......@@ -991,8 +995,7 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->Init(context, ddim);
cl_image->InitEmptyImage(context, ddim);
}
}
}
......
......@@ -98,8 +98,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
void Init() {}
void RunImpl() {
param_.Out()->ShareDataWith(*param_.InputX());
param_.Out()->set_lod(param_.InputX()->lod());
param_.Out()->ShareDataWith(*param_.InputX());
param_.Out()->set_lod(param_.InputX()->lod());
}
protected:
......
......@@ -18,9 +18,10 @@ limitations under the License. */
inline hafl4 activation(half4 in
#ifdef PRELU
,half4 prelu_alpha
,
half4 prelu_alpha
#endif
) {
) {
half4 output;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (half4)0.0);
......@@ -31,4 +32,3 @@ inline hafl4 activation(half4 in
#endif
return output;
}
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace operators {
......@@ -56,15 +57,15 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
framework::CLImage *new_scale = new framework::CLImage();
new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr,
variance->dims());
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext());
framework::CLImage *new_bias = new framework::CLImage();
new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims());
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
PADDLE_MOBILE_ENFORCE(
......@@ -115,26 +116,32 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &c_block);
clSetKernelArg(kernel, 1, sizeof(int), &w);
clSetKernelArg(kernel, 2, sizeof(int), &nh);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 9, sizeof(int), &stride);
clSetKernelArg(kernel, 10, sizeof(int), &offset);
clSetKernelArg(kernel, 11, sizeof(int), &input_c);
clSetKernelArg(kernel, 12, sizeof(int), &dilation);
clSetKernelArg(kernel, 13, sizeof(int), &input_width);
clSetKernelArg(kernel, 14, sizeof(int), &input_height);
clSetKernelArg(kernel, 15, sizeof(int), &output_width);
clSetKernelArg(kernel, 16, sizeof(int), &output_height);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ConvAddBNReluKernel<GPU_CL, float>;
......
......@@ -65,24 +65,31 @@ void ConvAddKernel<GPU_CL, float>::Compute(
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &c_block);
clSetKernelArg(kernel, 1, sizeof(int), &w);
clSetKernelArg(kernel, 2, sizeof(int), &nh);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 7, sizeof(int), &stride);
clSetKernelArg(kernel, 8, sizeof(int), &offset);
clSetKernelArg(kernel, 9, sizeof(int), &input_c);
clSetKernelArg(kernel, 10, sizeof(int), &dilation);
clSetKernelArg(kernel, 11, sizeof(int), &input_width);
clSetKernelArg(kernel, 12, sizeof(int), &input_height);
clSetKernelArg(kernel, 13, sizeof(int), &output_width);
clSetKernelArg(kernel, 14, sizeof(int), &output_height);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ConvAddKernel<GPU_CL, float>;
......
......@@ -21,63 +21,69 @@ namespace operators {
template <>
bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
// PADDLE_MOBILE_ENFORCE(
// param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
// param->Paddings()[0] == param->Paddings()[1],
// "need equal");
// int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
// static_cast<int>(param->Paddings()[1]);
// param->SetOffset(offset);
//
// if (param->Filter()->WidthOfOneBlock() == 1 &&
// param->Filter()->HeightOfOneBlock() == 1) {
// this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
// } else if (param->Filter()->dims()[1] == 1) {
// this->cl_helper_.AddKernel("depth_conv_3x3",
// "conv_add_bn_relu_kernel.cl");
// } else if (param->Filter()->WidthOfOneBlock() == 3 &&
// param->Filter()->HeightOfOneBlock() == 3) {
// this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
// } else {
// PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
// }
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size =
// this->cl_helper_.DefaultWorkSize(*param.Output()); int c_block =
// default_work_size[0]; int w = default_work_size[1]; int nh =
// default_work_size[2]; auto input = param.Input()->GetCLImage(); auto
// filter = param.Filter()->GetCLImage(); auto output = param.Output(); int
// stride = param.Strides()[0]; int offset = param.Offset(); int input_c =
// param.Input()->CBlock(); int dilation = param.Dilations()[0]; int
// input_width = param.Input()->WidthOfOneBlock(); int input_height =
// param.Input()->HeightOfOneBlock();
//
// clSetKernelArg(kernel, 0, sizeof(int), &c_block);
// clSetKernelArg(kernel, 1, sizeof(int), &w);
// clSetKernelArg(kernel, 2, sizeof(int), &nh);
// clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
// clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
// clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
// clSetKernelArg(kernel, 6, sizeof(int), &stride);
// clSetKernelArg(kernel, 7, sizeof(int), &offset);
// clSetKernelArg(kernel, 8, sizeof(int), &input_c);
// clSetKernelArg(kernel, 9, sizeof(int), &dilation);
// clSetKernelArg(kernel, 10, sizeof(int), &input_width);
// clSetKernelArg(kernel, 11, sizeof(int), &input_height);
//
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL);
// auto kernel = this->cl_helper_.KernelAt(0);
// size_t global_work_size[3] = {1, 2, 3};
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// global_work_size, NULL, 0, NULL, NULL);
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ConvKernel<GPU_CL, float>;
......
......@@ -24,9 +24,9 @@ template <>
bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
DLOG << " depthwise conv kernel init begin ";
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
"need equal");
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
......@@ -36,7 +36,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
}
template <>
void DepthwiseConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
void DepthwiseConvKernel<GPU_CL, float>::Compute(
const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
......@@ -54,23 +55,30 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param)
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &c_block);
clSetKernelArg(kernel, 1, sizeof(int), &w);
clSetKernelArg(kernel, 2, sizeof(int), &nh);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 6, sizeof(int), &stride);
clSetKernelArg(kernel, 7, sizeof(int), &offset);
clSetKernelArg(kernel, 8, sizeof(int), &input_c);
clSetKernelArg(kernel, 9, sizeof(int), &dilation);
clSetKernelArg(kernel, 10, sizeof(int), &input_width);
clSetKernelArg(kernel, 11, sizeof(int), &input_height);
clSetKernelArg(kernel, 12, sizeof(int), &output_width);
clSetKernelArg(kernel, 13, sizeof(int), &output_height);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class DepthwiseConvKernel<GPU_CL, float>;
......@@ -78,4 +86,4 @@ template class DepthwiseConvKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
\ No newline at end of file
#endif
......@@ -12,42 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/log.h"
#include "operators/kernel/feed_kernel.h"
#include "common/log.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
DLOG<<"Init feed";
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
DLOG<<"feed_kernel";
auto kernel = this->cl_helper_.KernelAt(0);
cl_int status;
auto output = param.Out();
auto input = param.InputX();
const float *input_data = input->data<float>();
cl_mem cl_image = output->GetCLImage();
int height = output->dims()[2];
int width = output->dims()[3];
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height);
size_t global_work_size[2] = {height,width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
}
template class FeedKernel<GPU_CL, float>;
} // namespace operators
namespace operators {
template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
DLOG << "Init feed";
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
DLOG << "feed_kernel";
auto kernel = this->cl_helper_.KernelAt(0);
cl_int status;
auto output = param.Out();
auto input = param.InputX();
DLOG << " input: " << input;
const float *input_data = input->data<float>();
cl_mem cl_image = output->GetCLImage();
int height = output->dims()[2];
int width = output->dims()[3];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height);
size_t global_work_size[2] = {height, width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
global_work_size, NULL, 0, NULL, NULL);
}
template class FeedKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
......@@ -19,13 +19,13 @@ namespace paddle_mobile {
namespace operators {
template <>
bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL> *param) {
bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
this->cl_helper_.AddKernel("relu", "relu.cl");
return true;
}
template <>
void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL> &param) {
void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
const auto* input = param.InputX();
auto* output = param.Out();
......@@ -34,7 +34,7 @@ void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL> &param) {
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
const size_t work_size[2] = { input->ImageWidth(), input->ImageHeight() };
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, NULL, 0, NULL, NULL);
}
......@@ -43,4 +43,4 @@ template class ReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
\ No newline at end of file
#endif
......@@ -25,30 +25,29 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
template <>
void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
const auto * input = param.InputX();
auto * output = param.Out();
auto kernel = this->cl_helper_.KernelAt(0);
const auto *input = param.InputX();
auto *output = param.Out();
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
const auto & inputDim = input->dims();
const auto & outputDim = output->dims();
const auto &inputDim = input->dims();
const auto &outputDim = output->dims();
int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
int odims[4] = {outputDim[0], outputDim[1], outputDim[2], outputDim[3]};
clSetKernelArg(kernel, 2, sizeof(int), dims);
clSetKernelArg(kernel, 3, sizeof(int), dims+1);
clSetKernelArg(kernel, 4, sizeof(int), dims+2);
clSetKernelArg(kernel, 5, sizeof(int), dims+3);
clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
clSetKernelArg(kernel, 6, sizeof(int), odims);
clSetKernelArg(kernel, 7, sizeof(int), odims+1);
clSetKernelArg(kernel, 8, sizeof(int), odims+2);
clSetKernelArg(kernel, 9, sizeof(int), odims+3);
const size_t work_size[2] = { output->ImageWidth(), output->ImageHeight() };
clSetKernelArg(kernel, 7, sizeof(int), odims + 1);
clSetKernelArg(kernel, 8, sizeof(int), odims + 2);
clSetKernelArg(kernel, 9, sizeof(int), odims + 3);
const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
}
template class ReshapeKernel<GPU_CL, float>;
......
......@@ -29,18 +29,18 @@ template <>
void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
const auto * input = param.InputX();
auto * output = param.Out();
const auto *input = param.InputX();
auto *output = param.Out();
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
const auto & inputDim = input->dims();
const auto &inputDim = input->dims();
int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
clSetKernelArg(kernel, 2, sizeof(int), dims);
clSetKernelArg(kernel, 3, sizeof(int), dims+1);
clSetKernelArg(kernel, 4, sizeof(int), dims+2);
clSetKernelArg(kernel, 5, sizeof(int), dims+3);
clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
......
......@@ -18,15 +18,15 @@ limitations under the License. */
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>>{
public:
void Compute(const FeedParam<DeviceType> &param);
bool Init(FeedParam<DeviceType> *param);
};
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
public:
void Compute(const FeedParam<DeviceType> &param);
bool Init(FeedParam<DeviceType> *param);
};
} // namespace operators
} // namespace operators
} // namespace paddle_mobile
......@@ -29,8 +29,8 @@ int main() {
bool optimize = true;
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
......
......@@ -19,14 +19,14 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
auto time1 = time();
auto time1 = paddle_mobile::time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, false);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
......@@ -42,13 +42,13 @@ int main() {
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
auto time3 = paddle_mobile::time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
auto time4 = paddle_mobile::time();
std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册