Commit b054affa authored by Ray Liu, committed by GitHub

Merge pull request #1070 from codeWorm2015/opencl

 commit cl code
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once

#include <chrono>

namespace paddle_mobile {

using Time = decltype(std::chrono::high_resolution_clock::now());

inline Time time() { return std::chrono::high_resolution_clock::now(); }
@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}
}
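For reference, a minimal sketch of how the timing helpers above are meant to be used. The include path is an assumption, and time_diff is taken to return milliseconds, which is how the test programs further down print it:

#include <iostream>

#include "common/common.h"  // assumed path of the timing helpers above

int main() {
  auto t1 = paddle_mobile::time();  // start timestamp
  // ... code to be measured ...
  auto t2 = paddle_mobile::time();  // end timestamp
  // time_diff(t1, t2) is printed as milliseconds by the tests below
  std::cout << "elapsed: " << paddle_mobile::time_diff(t1, t2) << " ms\n";
  return 0;
}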
@@ -18,8 +18,8 @@ limitations under the License. */
#include <string>

#include "CL/cl.h"
#include "common/enforce.h"
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_tool.h"
......
@@ -14,110 +14,107 @@ limitations under the License. */
#include "cl_image.h"

namespace paddle_mobile {
namespace framework {

void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
  size_t N, C, H, W;
  if (ddim.size() == 4) {
    N = ddim[0];
    if (N < 0) {
      N = 1;
    }
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
  } else if (ddim.size() == 1) {
    N = 1;
    C = ddim[0];
    H = 1;
    W = 1;
  }
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;

  float *p = tensor->data<float>();
  half imageData[width * height * 4];
  cl_int err;
  cl_mem image = cl_image->GetCLImage();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {width, height, 1};
  err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                           imageData, 0, NULL, NULL);

  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0;
      for (int h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          *p = half2float(imageData[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }

  if (err != CL_SUCCESS) {
    // TODO: error handling
  }
}

void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
  size_t N, C, H, W;
  if (ddim.size() == 4) {
    N = ddim[0];
    if (N < 0) {
      N = 1;
    }
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
  } else if (ddim.size() == 1) {
    N = 1;
    C = ddim[0];
    H = 1;
    W = 1;
  }
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;

  const float *p = tensor->data<float>();
  half imageData[width * height * 4];
  cl_mem image = cl_image->GetCLImage();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {width, height, 1};
  cl_int err;
  err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                           imageData, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    // TODO: error handling
  }

  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0;
      for (int h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          imageData[i2] = float2half(*p);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}

}  // namespace framework
}  // namespace paddle_mobile
@@ -28,8 +28,93 @@ class CLImage {
 public:
  CLImage() = default;

  /*
   * will not hold input tensor data, memcpy in this method
   * */
  void SetTensorData(float *tensorData, const DDim &dim) {
    int numel = product(dim);
    if (tensor_data_ != nullptr) {
      delete[](tensor_data_);
    }
    tensor_data_ = new float[numel];
    memcpy(tensor_data_, tensorData, numel);
    tensor_dims_ = dim;
  }

  /*
   * need call SetTensorData first
   * */
  void InitCLImage(cl_context context) {
    if (tensor_data_ == nullptr) {
      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
    }
    InitCLImage(context, tensor_data_, tensor_dims_);
    delete[](tensor_data_);
    tensor_data_ = nullptr;
    initialized_ = true;
  }

  void InitEmptyImage(cl_context context, const DDim &dim) {
    if (tensor_data_ != nullptr) {
      PADDLE_MOBILE_THROW_EXCEPTION(
          " empty image tensor data shouldn't have value");
    }
    InitCLImage(context, nullptr, dim);
    initialized_ = true;
  }

  cl_mem GetCLImage() const { return cl_image_; }

  const DDim &ImageDims() { return image_dims_; }

  inline size_t ImageWidth() const { return image_width_; }

  inline size_t ImageHeight() const { return image_height_; }

  /*
   * block of channels, 4 channel one block
   * */
  inline size_t CBlock() const { return c_block_; }

  /*
   * width of original tensor
   * */
  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }

  /*
   * height of original tensor
   * */
  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }

  /*
   * resize original tensor dim
   * */
  inline CLImage &Resize(const DDim &dims) {
    tensor_dims_ = dims;
    return *this;
  }

  template <typename T>
  T *data() const {
    if (initialized_) {
      PADDLE_MOBILE_THROW_EXCEPTION(
          " cl image has initialized, tensor data has been deleted ");
    }
    return reinterpret_cast<T *>(tensor_data_);
  }

  /*
   * numel of tensor dim
   * */
  inline int64_t numel() const { return product(tensor_dims_); }

  /*
   * original tensor dim
   * */
  const DDim &dims() const { return tensor_dims_; }

 private:
  void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
    cl_image_format cf = {.image_channel_order = CL_RGBA,
                          .image_channel_data_type = CL_HALF_FLOAT};
    // NCHW -> [W * (C+3)/4, H * N]
@@ -62,12 +147,13 @@ class CLImage {
    image_width_ = width;
    image_height_ = height;
    image_dims_ = make_ddim({image_width_, image_height_});

    std::unique_ptr<half_t[]> imageData{};
    int count = 0;
    if (tensor_data != nullptr) {
      imageData.reset(new half_t[width * height * 4]);
      float *p = tensor_data;
      size_t i0 = 0;
      for (int n = 0; n < N; n++) {
        for (int c = 0; c < C; c++) {
@@ -108,39 +194,8 @@ class CLImage {
      // TODO(HaiPeng): error handling
      PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
    }
  }

  bool initialized_ = false;
  cl_mem cl_image_;
  size_t image_width_;
@@ -149,7 +204,8 @@ class CLImage {
  size_t image_height_;
  size_t c_block_;
  DDim tensor_dims_;
  DDim image_dims_;
  float *tensor_data_;
  cl_context context_;
};
......
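To make the new two-step initialization concrete, here is a minimal usage sketch (not part of the commit): parameter images copy their host data in and then create the CL image, while output images are created empty with only a shape. The variable names and tensor shapes are purely illustrative.

// Sketch only: `context` and `weights` are assumed to be provided by the caller.
void PrepareImages(cl_context context, float *weights) {
  using namespace paddle_mobile::framework;

  // Parameter (e.g. a filter): copy host data, then build the image.
  CLImage filter_image;
  filter_image.SetTensorData(weights, make_ddim({32, 3, 3, 3}));
  filter_image.InitCLImage(context);  // uploads and releases the host copy

  // Output: only the shape is known up front, so create an empty image.
  CLImage output_image;
  output_image.InitEmptyImage(context, make_ddim({1, 32, 112, 112}));
}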
@@ -56,7 +56,8 @@ class CLScope {
    auto program = CLEngine::Instance()->CreateProgramWith(
        context_.get(), "./cl_kernel/" + file_name);

    status_ =
        clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
    CL_CHECK_ERRORS(status_);

    programs_[file_name] = std::move(program);
......
@@ -21,12 +21,13 @@ namespace framework {
const char* opencl_error_to_str(cl_int error);

#define CL_CHECK_ERRORS(ERR)                                             \
  if (ERR != CL_SUCCESS) {                                               \
    printf(                                                              \
        "OpenCL error with code %s happened in file %s at line %d. "     \
        "Exiting.\n",                                                    \
        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,    \
        __LINE__);                                                       \
  }

} // namespace framework
......
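The kernel files below use this macro after each OpenCL call; the pattern looks roughly like this (the kernel and argument names are placeholders, not code from the commit):

// Sketch: every OpenCL call that returns a cl_int gets its status checked.
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
                                NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);

Fully qualifying opencl_error_to_str inside the macro is what lets it be expanded outside the framework namespace, for example from the operator kernels.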
@@ -928,7 +928,8 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
          framework::DDim ddim = framework::make_ddim(desc.Dims());

          // has not init
          cl_image->SetTensorData(tensorInput, ddim);

          delete origin_data;
          paddle_mobile::memory::Free(tensorInput);
@@ -941,7 +942,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
          // framework::DDim ddim = framework::make_ddim(desc.Dims());
          framework::DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, ddim);
        }
      }
    }
@@ -982,7 +983,10 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<framework::CLImage>();
@@ -991,8 +995,7 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = cl_image->dims();
        // framework::DDim ddim = framework::make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, ddim);
      }
    }
  }
......
@@ -98,8 +98,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
  void Init() {}
  void RunImpl() {
    param_.Out()->ShareDataWith(*param_.InputX());
    param_.Out()->set_lod(param_.InputX()->lod());
  }

 protected:
......
@@ -18,9 +18,10 @@ limitations under the License. */
inline hafl4 activation(half4 in
#ifdef PRELU
                        ,
                        half4 prelu_alpha
#endif
) {
  half4 output;
#ifdef PRELU
  output = select(prelu_alpha * in, in, in >= (half4)0.0);
@@ -31,4 +32,3 @@ inline hafl4 activation(half4 in
#endif
  return output;
}
@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_tool.h"

namespace paddle_mobile {
namespace operators {
@@ -56,15 +57,15 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  framework::CLImage *new_scale = new framework::CLImage();
  new_scale->SetTensorData(new_scale_ptr, variance->dims());
  new_scale->InitCLImage(this->cl_helper_.CLContext());

  framework::CLImage *new_bias = new framework::CLImage();
  new_bias->SetTensorData(new_bias_ptr, variance->dims());
  new_bias->InitCLImage(this->cl_helper_.CLContext());

  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

  PADDLE_MOBILE_ENFORCE(
@@ -115,26 +116,32 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
  status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
  status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
  status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
  status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
  status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
  status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
  status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);

  CL_CHECK_ERRORS(status);

  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);

  CL_CHECK_ERRORS(status);
}

template class ConvAddBNReluKernel<GPU_CL, float>;
......
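Each of the convolution kernels in this commit repeats the same `status = clSetKernelArg(...)` / `CL_CHECK_ERRORS(status)` sequence. As a purely illustrative sketch (not part of this commit), the pattern could be factored into a small helper that tracks the argument index and checks every status:

#include "CL/cl.h"
#include "framework/cl/cl_tool.h"  // for CL_CHECK_ERRORS

// Hypothetical helper, not present in the repository: sets kernel arguments
// in declaration order and checks each clSetKernelArg status.
class KernelArgSetter {
 public:
  explicit KernelArgSetter(cl_kernel kernel) : kernel_(kernel) {}

  template <typename T>
  KernelArgSetter &Add(const T &value) {
    cl_int status = clSetKernelArg(kernel_, index_++, sizeof(T), &value);
    CL_CHECK_ERRORS(status);
    return *this;
  }

 private:
  cl_kernel kernel_;
  cl_uint index_ = 0;
};

// Usage sketch: KernelArgSetter(kernel).Add(c_block).Add(w).Add(nh).Add(input);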
@@ -65,24 +65,31 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);

  CL_CHECK_ERRORS(status);

  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);

  CL_CHECK_ERRORS(status);
}

template class ConvAddKernel<GPU_CL, float>;
......
@@ -21,63 +21,69 @@ namespace operators {
template <>
bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
  PADDLE_MOBILE_ENFORCE(
      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
          param->Paddings()[0] == param->Paddings()[1],
      "need equal");

  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
               static_cast<int>(param->Paddings()[1]);
  param->SetOffset(offset);

  if (param->Filter()->WidthOfOneBlock() == 1 &&
      param->Filter()->HeightOfOneBlock() == 1) {
    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
  } else if (param->Filter()->dims()[1] == 1) {
    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
             param->Filter()->HeightOfOneBlock() == 3) {
    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
  }

  return true;
}

template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  auto input = param.Input()->GetCLImage();
  auto filter = param.Filter()->GetCLImage();
  auto output = param.Output();
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();

  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);

  CL_CHECK_ERRORS(status);

  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);

  CL_CHECK_ERRORS(status);
}

template class ConvKernel<GPU_CL, float>;
......
@@ -24,9 +24,9 @@ template <>
bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
  DLOG << " depthwise conv kernel init begin ";
  PADDLE_MOBILE_ENFORCE(
      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
          param->Paddings()[0] == param->Paddings()[1],
      "need equal");
  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
               static_cast<int>(param->Paddings()[1]);
  param->SetOffset(offset);
@@ -36,7 +36,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
}

template <>
void DepthwiseConvKernel<GPU_CL, float>::Compute(
    const ConvParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
  int c_block = default_work_size[0];
@@ -54,23 +55,30 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

  cl_int status;

  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);

  CL_CHECK_ERRORS(status);

  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);

  CL_CHECK_ERRORS(status);
}

template class DepthwiseConvKernel<GPU_CL, float>;
@@ -78,4 +86,4 @@ template class DepthwiseConvKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
\ No newline at end of file
@@ -12,42 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "operators/kernel/feed_kernel.h"
#include "common/log.h"

namespace paddle_mobile {
namespace operators {

template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
  DLOG << "Init feed";
  this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
  return true;
}

template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  DLOG << "feed_kernel";
  auto kernel = this->cl_helper_.KernelAt(0);
  cl_int status;
  auto output = param.Out();
  auto input = param.InputX();
  DLOG << " input: " << input;

  const float *input_data = input->data<float>();
  cl_mem cl_image = output->GetCLImage();
  int height = output->dims()[2];
  int width = output->dims()[3];
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data);
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image);
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width);
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height);

  size_t global_work_size[2] = {height, width};
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                         global_work_size, NULL, 0, NULL, NULL);
}

template class FeedKernel<GPU_CL, float>;

} // namespace operators
} // namespace paddle_mobile
@@ -19,13 +19,13 @@ namespace paddle_mobile {
namespace operators {

template <>
bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
  this->cl_helper_.AddKernel("relu", "relu.cl");
  return true;
}

template <>
void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  const auto* input = param.InputX();
  auto* output = param.Out();
@@ -34,7 +34,7 @@ void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL> &param) {
  auto outputImage = output->GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
  const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         work_size, NULL, 0, NULL, NULL);
}
@@ -43,4 +43,4 @@ template class ReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
\ No newline at end of file
@@ -25,30 +25,29 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
template <>
void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  const auto *input = param.InputX();
  auto *output = param.Out();
  auto inputImage = input->GetCLImage();
  auto outputImage = output->GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
  const auto &inputDim = input->dims();
  const auto &outputDim = output->dims();
  int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
  int odims[4] = {outputDim[0], outputDim[1], outputDim[2], outputDim[3]};
  clSetKernelArg(kernel, 2, sizeof(int), dims);
  clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
  clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
  clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
  clSetKernelArg(kernel, 6, sizeof(int), odims);
  clSetKernelArg(kernel, 7, sizeof(int), odims + 1);
  clSetKernelArg(kernel, 8, sizeof(int), odims + 2);
  clSetKernelArg(kernel, 9, sizeof(int), odims + 3);
  const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                         work_size, NULL, 0, NULL, NULL);
}

template class ReshapeKernel<GPU_CL, float>;
......
@@ -29,18 +29,18 @@ template <>
void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
  const auto *input = param.InputX();
  auto *output = param.Out();
  auto inputImage = input->GetCLImage();
  auto outputImage = output->GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
  const auto &inputDim = input->dims();
  int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
  clSetKernelArg(kernel, 2, sizeof(int), dims);
  clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
  clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
  clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);
......
@@ -18,15 +18,15 @@ limitations under the License. */
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

using namespace framework;

template <typename DeviceType, typename T>
class FeedKernel
    : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
 public:
  void Compute(const FeedParam<DeviceType> &param);
  bool Init(FeedParam<DeviceType> *param);
};

} // namespace operators
} // namespace paddle_mobile
@@ -29,8 +29,8 @@ int main() {
  bool optimize = true;
  auto time1 = time();
  if (paddle_mobile.Load(g_googlenet, optimize)) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
              << std::endl;
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
......
@@ -19,14 +19,14 @@ limitations under the License. */
int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
  //  paddle_mobile.SetThreadNum(4);
  auto time1 = paddle_mobile::time();
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                                 std::string(g_mobilenet_detect) + "/params", true);

  auto isok = paddle_mobile.Load(g_mobilenet, false);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms"
              << std::endl;

    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
@@ -42,13 +42,13 @@ int main() {
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
    auto time3 = paddle_mobile::time();
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
    DLOG << vec_result;
    auto time4 = paddle_mobile::time();
    std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }
......