Commit 3eef149e authored by Ray Liu, committed by GitHub

Merge pull request #1084 from codeWorm2015/opencl

Opencl
...
@@ -4,7 +4,7 @@ option(USE_OPENMP "openmp support" OFF)
 project(paddle-mobile)
 option(DEBUGING "enable debug mode" ON)
-option(USE_EXCEPTION "use std exception" OFF)
+option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" OFF)
 # select the platform to build
 option(CPU "armv7 with neon" OFF)
...
...
@@ -52,7 +52,7 @@ class CLEngine {
       cl_context context, std::string file_name) {
     FILE *file = fopen(file_name.c_str(), "rb");
     PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
-                          filename.c_str());
+                          file_name.c_str());
     fseek(file, 0, SEEK_END);
     int64_t size = ftell(file);
     PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
...
...
@@ -488,7 +488,7 @@ static const uint8_t shifttable[512] = {
     0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};

 half_t float2half(float f) {
-  uint32_t v = *reinterpret_cast<uint32_t*>(&f);
+  uint32_t v = *reinterpret_cast<uint32_t *>(&f);
   return basetable[(v >> 23) & 0x1ff] +
          ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]);
 }
@@ -496,5 +496,17 @@ half_t float2half(float f) {
 float half2float(half_t h) {
   uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] +
                exponenttable[h >> 10];
-  return *reinterpret_cast<float*>(&v);
+  return *reinterpret_cast<float *>(&v);
 }
+
+void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
+  for (int i = 0; i < count; ++i) {
+    h_array[i] = float2half(f_array[i]);
+  }
+}
+
+void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
+  for (int i = 0; i < count; ++i) {
+    f_array[i] = half2float(h_array[i]);
+  }
+}
...
@@ -18,4 +18,9 @@ limitations under the License. */

 typedef uint16_t half_t;

 half_t float2half(float f);
 float half2float(half_t h);
+
+void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
+
+void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
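A minimal usage sketch for the two new helpers (illustrative only, not part of the commit; namespace qualification is omitted and assumed to be in scope):

#include "framework/cl/cl_half.h"

int main() {
  // Round-trip three floats through fp16 bit patterns; fp16 keeps ~11
  // mantissa bits, so back[i] only approximates src[i].
  float src[3] = {0.5f, 1.0f, 3.14159f};
  half_t packed[3];
  float back[3];
  FloatArray2HalfArray(src, packed, 3);
  HalfArray2FloatArray(packed, back, 3);
  return 0;
}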
...
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <type_traits>
 #include <vector>

+#include "common/log.h"
 #include "framework/cl/cl_deleter.h"
 #include "framework/cl/cl_image.h"
 #include "framework/cl/cl_scope.h"
@@ -32,11 +33,16 @@ class CLHelper {
   explicit CLHelper(CLScope *scope) : scope_(scope) {}

   void AddKernel(const std::string &kernel_name, const std::string &file_name) {
+    DLOG << " begin add kernel ";
     auto kernel = scope_->GetKernel(kernel_name, file_name);
+    DLOG << " add kernel ing ";
     kernels.emplace_back(std::move(kernel));
   }

-  cl_kernel KernelAt(const int index) { return kernels[index].get(); }
+  cl_kernel KernelAt(const int index) {
+    DLOG << " kernel count: " << kernels.size();
+    return kernels[index].get();
+  }

   cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); }
...
...
@@ -17,7 +17,9 @@ limitations under the License. */
 #include <vector>

 #include "CL/cl.h"
 #include "framework/cl/cl_half.h"
+#include "framework/cl/cl_tool.h"
+
 #include "framework/ddim.h"
 #include "framework/tensor.h"
...
@@ -59,6 +61,7 @@ class CLImage {
       PADDLE_MOBILE_THROW_EXCEPTION(
           " empty image tensor data shouldn't have value");
     }
+    DLOG << " init empty image ";
     InitCLImage(context, nullptr, dim);
     initialized_ = true;
   }
...
@@ -98,7 +101,8 @@ class CLImage {
   T *data() const {
     if (initialized_) {
       PADDLE_MOBILE_THROW_EXCEPTION(
-          " cl image has initialized, tensor data has been deleted ");
+          " cl image has initialized, tensor data has been deleted, can't use "
+          "tensor data");
     }
     return reinterpret_cast<T *>(tensor_data_);
   }
...
@@ -115,6 +119,7 @@ class CLImage {
  private:
   void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
+    DLOG << " tensor dim: " << dim;
     cl_image_format cf = {.image_channel_order = CL_RGBA,
                           .image_channel_data_type = CL_HALF_FLOAT};
     // NCHW -> [W * (C+3)/4, H * N]
...
@@ -132,28 +137,22 @@ class CLImage {
         tensor_data_[i] = 0;
       }
     }
-    size_t N, C, H, W;
-    if (tensor_dims_.size() == 4) {
-      N = tensor_dims_[0];
-      if (N < 0) {
-        N = 1;
-      }
-      C = tensor_dims_[1];
-      H = tensor_dims_[2];
-      W = tensor_dims_[3];
-      width_of_one_block_ = W;
-      height_of_one_block_ = H;
-    } else if (tensor_dims_.size() == 1) {
-      N = 1;
-      C = tensor_dims_[0];
-      H = 1;
-      W = 1;
-      width_of_one_block_ = W;
-      height_of_one_block_ = H;
-    }
+    size_t new_dims[] = {1, 1, 1, 1};
+    for (int j = 0; j < dim.size(); ++j) {
+      new_dims[4 - dim.size() + j] = dim[j];
+    }
+
+    size_t N, C, H, W;
+    N = new_dims[0];
+    C = new_dims[1];
+    H = new_dims[2];
+    W = new_dims[3];
+
+    width_of_one_block_ = W;
+    height_of_one_block_ = H;

     size_t width = W * ((C + 3) / 4);
     size_t height = H * N;
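A standalone sketch (not part of the commit) of the padding-and-layout arithmetic the replacement code above implements, matching the NCHW -> [W * (C+3)/4, H * N] comment:

#include <cstddef>
#include <cstdio>

int main() {
  // Right-align a rank-3 dim {3, 224, 224} into {N, C, H, W} = {1, 3, 224, 224},
  // as the loop above does for any rank <= 4.
  size_t dim[] = {3, 224, 224};
  int rank = 3;
  size_t nchw[] = {1, 1, 1, 1};
  for (int j = 0; j < rank; ++j) {
    nchw[4 - rank + j] = dim[j];
  }
  size_t N = nchw[0], C = nchw[1], H = nchw[2], W = nchw[3];
  // Each RGBA texel packs 4 consecutive channels, hence the (C + 3) / 4.
  size_t width = W * ((C + 3) / 4);  // 224 * 1 = 224
  size_t height = H * N;             // 224 * 1 = 224
  std::printf("image: %zu x %zu\n", width, height);
  return 0;
}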
...
@@ -193,9 +192,12 @@ class CLImage {
       }
     }

     cl_int err;
+    DLOG << " image width: " << width;
+    DLOG << " image height: " << height;
     cl_image_ = clCreateImage2D(
         context,  // cl_context context
-        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,  // cl_mem_flags flags
+        CL_MEM_READ_WRITE |
+            (imageData ? CL_MEM_COPY_HOST_PTR : 0),  // cl_mem_flags flags
         &cf,     // const cl_image_format *image_format
         width,   // size_t image_width
         height,  // size_t image_height
...
@@ -205,6 +207,7 @@ class CLImage {
     if (err != CL_SUCCESS) {
       // TODO(HaiPeng): error handling
+      CL_CHECK_ERRORS(err);
       PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
     }
   }
...
@@ -222,9 +225,15 @@ class CLImage {
   cl_context context_;
 };

-void TensorToCLImage(Tensor *tensor, CLImage *image,cl_command_queue commandQueue);
+void TensorToCLImage(Tensor *tensor, CLImage *image,
+                     cl_command_queue commandQueue);

-void CLImageToTensor(CLImage *image, Tensor *tensor,cl_command_queue commandQueue);
+void CLImageToTensor(CLImage *image, Tensor *tensor,
+                     cl_command_queue commandQueue);
+
+#ifdef PADDLE_MOBILE_DEBUG
+Print &operator<<(Print &printer, const CLImage &image);
+#endif

 }  // namespace framework
 }  // namespace paddle_mobile
...
@@ -40,8 +40,11 @@ class CLScope {
   std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
       const std::string &kernel_name, const std::string &file_name) {
     auto program = Program(file_name);
+    DLOG << " get program ~ ";
     std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
-        clCreateKernel(program, kernel_name.c_str(), NULL));
+        clCreateKernel(program, kernel_name.c_str(), &status_));
+    CL_CHECK_ERRORS(status_);
+    DLOG << " create kernel ~ ";
     return std::move(kernel);
   }
@@ -58,11 +61,12 @@ class CLScope {
     status_ =
         clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
     CL_CHECK_ERRORS(status_);
+
     programs_[file_name] = std::move(program);

-    return program.get();
+    return programs_[file_name].get();
   }

  private:
...
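The second hunk above fixes a use-after-move: once std::move(program) hands the unique_ptr to programs_, the local program is empty, so the old return program.get() yielded nullptr. A minimal standalone illustration of the pitfall:

#include <cassert>
#include <memory>
#include <utility>

int main() {
  std::unique_ptr<int> program(new int(42));
  std::unique_ptr<int> stored = std::move(program);  // ownership transferred
  assert(program.get() == nullptr);  // what the old code was returning
  assert(*stored == 42);             // fetch through the new owner instead
  return 0;
}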
...
@@ -37,6 +37,8 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif

+int debug_to = 2;
+
 namespace paddle_mobile {
 namespace framework {
...
@@ -85,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   for (int i = 0; i < blocks.size(); ++i) {
     std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
     std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < ops.size(); ++j) {
+    for (int j = 0; j < debug_to; ++j) {
       std::shared_ptr<framework::OpDesc> op = ops[j];
       DLOG << "create op: " << j << " " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
...
@@ -414,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
     }
   }
 #else
-  for (int i = 0; i < ops.size(); i++) {
+  for (int i = 0; i < debug_to; i++) {
 #ifdef PADDLE_MOBILE_PROFILE
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
...
@@ -428,6 +430,11 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
 #endif
   }
 #endif
+
+  DLOG << " predict return nullptr";
+
+  return nullptr;
+
   auto last_op = ops.rbegin();
   auto output_map = (*last_op)->Outputs();
   std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
...
@@ -647,6 +654,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
     const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
   framework::Tensor tensor(input, framework::make_ddim(dims));
   std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
+  if (output_tensor != nullptr) {
   Executor<Dtype, P>::Ptype *output_ptr =
       output_tensor->data<typename Executor<Dtype, P>::Ptype>();
   std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
...
@@ -654,6 +662,10 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
     result_vector.push_back(output_ptr[j]);
   }
   return result_vector;
+  } else {
+    DLOG << "return empty vector";
+    return {};
+  }
 }

 #ifdef PADDLE_MOBILE_FPGA
...
...
@@ -57,7 +57,10 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}

 template <typename Dtype>
 void OperatorBase<Dtype>::Run() {
+  DLOG << " begin run " << type_;
   RunImpl();
+  DLOG << " end run " << type_;
+  return;
 #ifdef PADDLE_MOBILE_DEBUG
   DLOG << "-------------" << type_ << "----------------------------";
   vector<string> input_keys = GetInputKeys();
...
@@ -100,8 +103,9 @@ void OperatorBase<Dtype>::Run() {
 #ifdef PADDLE_MOBILE_CL
       if (type_ == "fetch") {
         Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
-        if (tensor)
+        if (tensor) {
           DLOG << type_ << " output- " << key << "=" << tensor->dims();
+        }
       } else {
         CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
         // cl_command_queue commandQueue =
...
...
@@ -12,4 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "conv_kernel.inc.cl"
+//#include "conv_kernel.inc.cl"
+
+__kernel void conv_3x3() {}
\ No newline at end of file
...
@@ -24,9 +24,16 @@ namespace operators {

 template <>
 bool ConvAddBNReluKernel<GPU_CL, float>::Init(
     FusionConvAddBNReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+
+  param->Filter()->InitCLImage(cl_helper_.CLContext());
+  param->Bias()->InitCLImage(cl_helper_.CLContext());
   // const CL *mean = param->InputMean();
   const framework::CLImage *mean = param->InputMean();
   const framework::CLImage *variance = param->InputVariance();
   const framework::CLImage *scale = param->InputScale();
   const framework::CLImage *bias = param->InputBias();
...
@@ -52,9 +59,6 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
     new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
   }

-  delete[](new_scale_ptr);
-  delete[](new_bias_ptr);
-
   framework::CLImage *new_scale = new framework::CLImage();
   new_scale->SetTensorData(new_scale_ptr, variance->dims());
...
@@ -68,6 +72,9 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);

+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+
   PADDLE_MOBILE_ENFORCE(
       param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
           param->Paddings()[0] == param->Paddings()[1],
...
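Aside (not part of the diff): two fixes land in these hunks. The delete[] calls move to after SetTensorData/SetNewScale/SetNewBias, so the buffers are no longer freed before they are consumed, and the filter/bias CL images are initialized before use. The arithmetic being folded is the standard batch-norm fusion; assuming new_scale_ptr follows the same pattern as the visible new_bias_ptr line, with inv_std = 1 / sqrt(variance + epsilon):

  new_scale = scale * inv_std
  new_bias  = bias - mean * inv_std * scale
  output    = new_scale * conv(input, filter) + new_bias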
...
@@ -25,6 +25,9 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
       param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
           param->Paddings()[0] == param->Paddings()[1],
       "need equal");
+
+  param->Filter()->InitCLImage(cl_helper_.CLContext());
+  param->Bias()->InitCLImage(cl_helper_.CLContext());
   int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
                static_cast<int>(param->Paddings()[1]);
   param->SetOffset(offset);
...
...
@@ -26,18 +26,32 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
       param->Paddings()[0] == param->Paddings()[1],
       "need equal");
+
+  param->Filter()->InitCLImage(cl_helper_.CLContext());
+
   int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
                static_cast<int>(param->Paddings()[1]);
   param->SetOffset(offset);

+  DLOG << " init helper: " << &cl_helper_;
+  DLOG << " conv kernel add kernel ~ ";
+  DLOG << " width of one block: " << param->Filter()->WidthOfOneBlock();
+  DLOG << " height of one block: " << param->Filter()->HeightOfOneBlock();
+  DLOG << " filter dims: " << param->Filter()->dims();
+
   if (param->Filter()->WidthOfOneBlock() == 1 &&
       param->Filter()->HeightOfOneBlock() == 1) {
-    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+    DLOG << " here1 ";
+    this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
   } else if (param->Filter()->dims()[1] == 1) {
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+    DLOG << " here2 ";
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_kernel.cl");
   } else if (param->Filter()->WidthOfOneBlock() == 3 &&
              param->Filter()->HeightOfOneBlock() == 3) {
-    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+    DLOG << " here3 ";
+    this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
...
@@ -47,14 +61,27 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
+  DLOG << " Compute helper: " << &cl_helper_;
+  DLOG << " begin compute ";
   auto kernel = this->cl_helper_.KernelAt(0);
+  DLOG << " get work size ";
   auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  DLOG << " end work size ";
   int c_block = default_work_size[0];
   int w = default_work_size[1];
   int nh = default_work_size[2];
   auto input = param.Input()->GetCLImage();
+  DLOG << " get Input ";
   auto filter = param.Filter()->GetCLImage();
+  DLOG << " get Filter ";
   auto output = param.Output();
+  DLOG << " get Output ";
   int stride = param.Strides()[0];
   int offset = param.Offset();
   int input_c = param.Input()->CBlock();
...
@@ -64,6 +91,8 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
   cl_int status;
+
+  DLOG << " begin set kernel arg ";
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
   status = clSetKernelArg(kernel, 1, sizeof(int), &w);
   status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
...
@@ -77,12 +106,18 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
   status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
   status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  DLOG << " end set kernel arg ";

   CL_CHECK_ERRORS(status);

+  DLOG << " begin enqueue ";
+
   status =
       clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                              default_work_size.data(), NULL, 0, NULL, NULL);

+  DLOG << " end enqueue ";
+
   CL_CHECK_ERRORS(status);
 }
...
...
@@ -27,6 +27,7 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
       param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
           param->Paddings()[0] == param->Paddings()[1],
       "need equal");
+  param->Filter()->InitCLImage(cl_helper_.CLContext());
   int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
                static_cast<int>(param->Paddings()[1]);
   param->SetOffset(offset);
...
...
@@ -948,6 +948,7 @@ class FetchParam : public OpParam {
     input_x_ = InputXFrom<GType>(inputs, scope);
     out_ = OutFrom(outputs, scope);
   }
+
   const RType *InputX() const { return input_x_; }
   Tensor *Out() const { return out_; }
...
...
@@ -34,23 +34,24 @@ int main() {
   GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
   auto vec_result = paddle_mobile.Predict(input, dims);

-  std::vector<float>::iterator biggest =
-      std::max_element(std::begin(vec_result), std::end(vec_result));
-  std::cout << " Max element is " << *biggest << " at position "
-            << std::distance(std::begin(vec_result), biggest) << std::endl;
+  // std::vector<float>::iterator biggest =
+  //     std::max_element(std::begin(vec_result), std::end(vec_result));
+  // std::cout << " Max element is " << *biggest << " at position "
+  //           << std::distance(std::begin(vec_result), biggest) <<
+  //           std::endl;

   // warm up ten times
-  for (int i = 0; i < 10; ++i) {
-    auto vec_result = paddle_mobile.Predict(input, dims);
-  }
-  auto time3 = paddle_mobile::time();
-  for (int i = 0; i < 10; ++i) {
-    auto vec_result = paddle_mobile.Predict(input, dims);
-  }
-  DLOG << vec_result;
-  auto time4 = paddle_mobile::time();
-  std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10
-            << "ms" << std::endl;
+  // for (int i = 0; i < 10; ++i) {
+  //   auto vec_result = paddle_mobile.Predict(input, dims);
+  // }
+  // auto time3 = paddle_mobile::time();
+  // for (int i = 0; i < 10; ++i) {
+  //   auto vec_result = paddle_mobile.Predict(input, dims);
+  // }
+  // DLOG << vec_result;
+  // auto time4 = paddle_mobile::time();
+  // std::cout << "predict cost :" << paddle_mobile::time_diff(time3,
+  //           time4) / 10 << "ms"
+  //           << std::endl;
 }
 std::cout << "If the result is NaN, check: test/images/g_test_image_1x3x224x224_banana "
...
cmake_minimum_required(VERSION 3.6)
project(web-exporter)
set(CMAKE_CXX_STANDARD 11)
file(GLOB PADDLE_MOBILE_CPP_FILES
"../../src/common/*.c"
"../../src/common/*.cpp"
"../../src/memory/*.cpp"
"../../src/framework/*.c"
"../../src/framework/*.cpp"
"../../src/framework/program/*.cpp"
"../../src/framework/program/program-optimize/*.cpp"
)
file(GLOB EXPORT_CPP_FILES "*.cpp")
add_executable(web-exporter ${PADDLE_MOBILE_CPP_FILES} ${EXPORT_CPP_FILES})
target_include_directories(web-exporter PRIVATE "../../src")
target_link_libraries(web-exporter)
\ No newline at end of file
#include "export.h"
inline std::string indent(int i) {
return std::string(i, ' ');
}
void export_nodejs(ProgramPtr program, ScopePtr scope, std::ostream & os) {
os << "module.exports.program = {\n";
os << indent(2) << var2str("blocks") << ": [\n";
for (const auto& block: program->Blocks()) {
os << indent(4) << "{\n";
os << indent(6) << var2str("vars") << ": {\n";
for (const auto& var: block->Vars()) {
const auto& dim = var->Tensor_desc().Dims();
os << indent(8) << var2str(var->Name()) << ": {\n";
os << indent(10) << var2str("dim") << ": " << var2str(dim) << ",\n";
os << indent(10) << var2str("persistable") << ": " << var2str(var->Persistable()) << "\n";
os << indent(8) << "},\n";
}
os << indent(6) << "},\n";
os << indent(6) << var2str("ops") << ": [\n";
for (const auto& op: block->Ops()) {
os << indent(8) << "{\n";
os << indent(10) << var2str("type") << ": " << var2str(op->Type()) << ",\n";
os << indent(10) << var2str("inputs") << ": {\n";
for (const auto& kv: op->GetInputs()) {
os << indent(12) << var2str(kv.first) << ": " << var2str(kv.second) << ",\n";
}
os << indent(10) << "},\n";
os << indent(10) << var2str("outputs") << ": {\n";
for (const auto& kv: op->GetOutputs()) {
os << indent(12) << var2str(kv.first) << ": " << var2str(kv.second) << ",\n";
}
os << indent(10) << "},\n";
os << indent(10) << var2str("attrs") << ": {\n";
for (const auto& kv: op->GetAttrMap()) {
os << indent(12) << var2str(kv.first) << ": ";
os << decltype(kv.second)::ApplyVistor(VarVisitor(), kv.second) << ",\n";
}
os << indent(10) << "},\n";
os << indent(8) << "},\n";
}
os << indent(6) << "],\n";
os << indent(4) << "},\n";
}
os << indent(2) << "]\n";
os << "}\n";
}
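For illustration only, with one block, one persistable var, and one op (names and dims hypothetical), the module emitted by export_nodejs takes roughly this shape:

module.exports.program = {
  "blocks": [
    {
      "vars": {
        "conv1_weights": {
          "dim": [32, 3, 3, 3],
          "persistable": true
        },
      },
      "ops": [
        {
          "type": "conv2d",
          "inputs": {
            "Input": ["data"],
          },
          "outputs": {
            "Output": ["conv1_out"],
          },
          "attrs": {
            "strides": [2, 2],
          },
        },
      ],
    },
  ]
}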
#include <cstdio>
#include "export.h"
void export_scope(ProgramPtr program, ScopePtr scope, const std::string & dirname) {
for (const auto& block: program->Blocks()) {
for (const auto& var: block->Vars()) {
if (var->Name() == "feed" || var->Name() == "fetch") {
continue;
}
if (var->Persistable()) {
auto* v = scope->FindVar(var->Name());
assert(v != nullptr);
int count = 1;
for (auto n: var->Tensor_desc().Dims()) {
count *= n;
}
auto* tensor = v->GetMutable<paddle_mobile::framework::LoDTensor>();
const float * p = tensor->mutable_data<float>();
std::string para_file_name = dirname + '/' + var->Name();
FILE *para_file = fopen(para_file_name.c_str(), "wb");
assert(p != nullptr);
fwrite(p, sizeof(float), count, para_file);
fclose(para_file);
// std::cout << "==> " << var->Name() << " " << count << "\n";
// for (int i = 0; i < count; i++) {
// std::cout << p[i] << ", ";
// }
// std::cout << "\n";
}
}
}
}
#include "export.h"
#include <sys/stat.h>
#include <sys/types.h>
class FakeExecutor : public paddle_mobile::framework::Executor<paddle_mobile::CPU, paddle_mobile::Precision::FP32> {
public:
FakeExecutor(const paddle_mobile::framework::Program<paddle_mobile::CPU> p) {
program_ = p;
batch_size_ = 1;
use_optimize_ = true;
loddable_ = false;
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
auto *variable_ptr = program_.scope->Var("batch_size");
variable_ptr[0].SetValue<int>(1);
if (program_.combined) {
InitCombineMemory();
} else {
InitMemory();
}
}
};
int main(int argc, char** argv) {
if (argc != 3) {
std::cout << "Usage: " << argv[0] << " <combined-modle-dir> <output-dir>\n";
return -1;
}
std::string model_dir = argv[1];
std::string model_path = model_dir + "/model";
std::string para_path = model_dir + "/params";
std::string out_dir = argv[2];
std::string out_model_js = out_dir + "/model.js";
std::string out_para_dir = out_dir + "/paras";
mkdir(out_dir.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);
mkdir(out_para_dir.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);
std::cout << "loading " << model_path << " & " << para_path << "\n";
paddle_mobile::framework::Loader<> loader;
auto program = loader.Load(model_path, para_path, true);
FakeExecutor executor(program);
auto optimizedProgram = program.optimizeProgram;
export_scope(optimizedProgram, program.scope, out_para_dir);
std::ofstream fs(out_model_js.c_str());
export_nodejs(optimizedProgram, program.scope, fs);
fs.close();
return 0;
}
#pragma once
#include <iostream>
#include <vector>
#include <memory>
#include <string>
#include <ostream>
#include <fstream>
#include "framework/loader.h"
#include "framework/executor.h"
#include "framework/scope.h"
#include "framework/program/program_desc.h"
// using paddle_mobile::framework::ProgramDesc;
// using paddle_mobile::framework::Scope;
using ProgramPtr = std::shared_ptr<paddle_mobile::framework::ProgramDesc>;
using ScopePtr = std::shared_ptr<paddle_mobile::framework::Scope>;
void export_nodejs(ProgramPtr program, ScopePtr scope, std::ostream & os = std::cout);
void export_scope(ProgramPtr program, ScopePtr scope, const std::string & dirname = ".");
template <typename T>
inline std::string var2str(const T & v) {
return std::to_string(v);
}
template <>
inline std::string var2str(const std::string & v) {
return "\"" + v + "\"";
}
inline std::string var2str(const char* v) {
return var2str<std::string>(v);
}
inline std::string var2str(const bool v) {
return v ? "true" : "false";
}
template <typename T>
std::string var2str(const std::vector<T> & v) {
std::string r = "[";
auto s = v.size();
for (int i = 0; i < s; i++) {
if (i) r += ", ";
r += var2str(v[i]);
}
return r + "]";
}
struct VarVisitor {
using type_t = decltype(var2str(0));
template <typename T>
type_t operator()(const T & v) {
return var2str(v);
}
};
\ No newline at end of file
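A quick sketch of what the var2str overloads above produce (values hypothetical):

var2str(7)                                 // "7"
var2str(true)                              // "true"
var2str("conv2d")                          // "\"conv2d\""
var2str(std::vector<int>{1, 3, 224, 224})  // "[1, 3, 224, 224]"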