未验证 提交 ae68ce7a 编写于 作者: R Ray Liu 提交者: GitHub

Merge pull request #1054 from codeWorm2015/opencl

resolve complier error
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "CL/cl.h"
#include "common/enforce.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
......
......@@ -43,17 +43,28 @@ class CLImage {
C = tensor_dims_[1];
H = tensor_dims_[2];
W = tensor_dims_[3];
width_of_one_block_ = W;
height_of_one_block_ = H;
} else if (tensor_dims_.size() == 1) {
N = 1;
C = tensor_dims_[0];
H = 1;
W = 1;
width_of_one_block_ = W;
height_of_one_block_ = H;
}
DLOG << "-------InitMemory-------";
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
image_width_ = width;
image_height_ = height;
std::unique_ptr<half_t[]> imageData{};
int count = 0;
if (tensorInput != nullptr) {
......@@ -95,9 +106,13 @@ class CLImage {
0, // size_t image_row_pitch
reinterpret_cast<void *>(imageData.get()), // void *host_ptr
&err);
if (err != CL_SUCCESS) {
// TODO(HaiPeng): error handling
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
initialized_ = true;
}
void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); }
......@@ -109,8 +124,6 @@ class CLImage {
const DDim &dims() const { return tensor_dims_; }
std::vector<size_t> DefaultWorkSize() { return {}; }
cl_mem GetCLImage() const { return cl_image_; }
template <typename T>
......@@ -120,24 +133,24 @@ class CLImage {
inline int64_t numel() const { return product(tensor_dims_); }
int ImageWidth() const { return image_width_; }
inline size_t ImageWidth() const { return image_width_; }
int ImageHeight() const { return image_height_; }
inline size_t ImageHeight() const { return image_height_; }
int CBlock() const { return c_block_; }
inline size_t CBlock() const { return c_block_; }
int WidthOfOneBlock() const { return width_of_one_block_; }
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
int HeightOfOneBlock() const { return height_of_one_block_; }
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
private:
bool initialized_ = false;
cl_mem cl_image_;
int image_width_;
int width_of_one_block_;
int height_of_one_block_;
int image_height_;
int c_block_;
size_t image_width_;
size_t width_of_one_block_;
size_t height_of_one_block_;
size_t image_height_;
size_t c_block_;
DDim tensor_dims_;
float *tensor_input_;
cl_context context_;
......
......@@ -95,15 +95,15 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
*/
template <typename Dtype, Precision P>
void FusionAndPrintInfos(
bool optimize, bool can_add_split, const Program<Dtype, P> &program,
bool optimize, bool can_add_split, Program<Dtype, P> *program,
const std::shared_ptr<ProgramDesc> &originProgramDesc) {
if (optimize) {
ProgramOptimize program_optimize;
program.optimizeProgram =
program->optimizeProgram =
program_optimize.FusionOptimize(originProgramDesc, can_add_split);
}
if (optimize) {
program.optimizeProgram->Description("optimize: ");
program->optimizeProgram->Description("optimize: ");
} else {
originProgramDesc->Description("program: ");
}
......@@ -186,7 +186,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
// use originProgramDesc and scope to init tensors
InitMemoryFromProgram(originProgramDesc, scope);
// perform fusion and print infos
FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
return program;
......@@ -195,7 +195,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
template <typename Dtype, Precision P>
const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
size_t read_size, const uint8_t *buf, size_t combined_params_len,
const uint8_t *combined_params_buf, bool optimize, bool quantification) {
uint8_t *combined_params_buf, bool optimize, bool quantification) {
bool can_add_split = false;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
......@@ -221,7 +221,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
auto scope = std::make_shared<Scope>();
program.scope = scope;
InitMemoryFromProgram(originProgramDesc, scope);
FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
nullptr);
return program;
......
......@@ -46,7 +46,7 @@ class Loader {
const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
const uint8_t *combined_params_buf,
uint8_t *combined_params_buf,
bool optimize = false,
bool quantification = false);
......
......@@ -32,7 +32,7 @@ class Program {
bool combined = false;
bool quantification = false;
size_t combined_params_len;
const uint8_t *combined_params_buf;
uint8_t *combined_params_buf;
private:
};
......
......@@ -68,9 +68,10 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
}
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::LoadCombinedMemory(
size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
const uint8_t *combined_params_buf) {
bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf) {
int batch_size = 1;
bool optimise = true;
bool quantification = false;
......
......@@ -83,7 +83,7 @@ class PaddleMobile {
*/
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len,
const uint8_t *combined_params_buf);
uint8_t *combined_params_buf);
void Clear();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
__kernel void elementwise_add(__global float* in, __global float* out) {
int num = get_global_id(0);
out[num] = in[num] * 0.1 + 102;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/kernel/conv_add_bn_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddBNReluKernel<GPU_CL, float>::Init(
FusionConvAddBNReluParam<GPU_CL> *param) {
return true;
}
template <>
void ConvAddBNReluKernel<GPU_CL, float>::Compute(
const FusionConvAddBNReluParam<GPU_CL> &param) {}
template class ConvAddBNReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -37,6 +37,7 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
......@@ -55,8 +56,13 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
framework::CLImage *new_scale = new framework::CLImage();
new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr,
variance->dims());
framework::CLImage *new_bias = new framework::CLImage();
new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
......@@ -65,10 +71,23 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->SetOffset(param->Filter()->dims()[2] / 2 -
static_cast<int>(param->Paddings()[1]));
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
......
......@@ -21,12 +21,63 @@ namespace operators {
template <>
bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
template <>
void ConvAddKernel<GPU_CL, float>::Compute(
const FusionConvAddParam<GPU_CL> &param) {}
const FusionConvAddParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto biase = param.Bias()->GetCLImage();
auto output = param.Output();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &c_block);
clSetKernelArg(kernel, 1, sizeof(int), &w);
clSetKernelArg(kernel, 2, sizeof(int), &nh);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 9, sizeof(int), &stride);
clSetKernelArg(kernel, 10, sizeof(int), &offset);
clSetKernelArg(kernel, 11, sizeof(int), &input_c);
clSetKernelArg(kernel, 12, sizeof(int), &input_width);
clSetKernelArg(kernel, 13, sizeof(int), &input_height);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
template class ConvAddKernel<GPU_CL, float>;
......
......@@ -21,12 +21,62 @@ namespace operators {
template <>
bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
// this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &c_block);
clSetKernelArg(kernel, 1, sizeof(int), &w);
clSetKernelArg(kernel, 2, sizeof(int), &nh);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 6, sizeof(int), &stride);
clSetKernelArg(kernel, 7, sizeof(int), &offset);
clSetKernelArg(kernel, 8, sizeof(int), &input_c);
clSetKernelArg(kernel, 9, sizeof(int), &dilation);
clSetKernelArg(kernel, 10, sizeof(int), &input_width);
clSetKernelArg(kernel, 11, sizeof(int), &input_height);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// auto kernel = this->cl_helper_.KernelAt(0);
// size_t global_work_size[3] = {1, 2, 3};
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册