提交 86044671 编写于 作者: L liuruilong

update conv kernel

上级 e81515d3
......@@ -16,7 +16,6 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else()
......
......@@ -16,6 +16,9 @@ limitations under the License. */
#include "framework/cl/cl_half.h"
namespace paddle_mobile {
namespace framework {
static const uint32_t mantissatable[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
......@@ -510,3 +513,6 @@ void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
f_array[i] = Half2Float(h_array[i]);
}
}
} // namespace framework
} // namespace paddle_mobile
......@@ -15,6 +15,9 @@ limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace framework {
typedef uint16_t half_t;
half_t Float2Half(float f);
......@@ -24,3 +27,6 @@ float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace framework
} // namespace paddle_mobile
......@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_image.h"
#include "framework/cl/cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
......@@ -63,7 +64,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
}
if (err != CL_SUCCESS) {
// TODO: error handling
CL_CHECK_ERRORS(err);
}
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
......@@ -97,7 +98,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) {
// TODO: error handling
CL_CHECK_ERRORS(err);
}
size_t i0 = 0;
for (int n = 0; n < N; n++) {
......@@ -117,7 +118,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
}
}
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image){
Print &operator<<(Print &printer, const CLImage &cl_image) {
printer << " dims: " << cl_image.dims() << "\n";
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
......@@ -148,8 +149,8 @@ Print &operator<<(Print &printer, const CLImage &cl_image){
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, imageData, 0, NULL, NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
......@@ -168,13 +169,13 @@ Print &operator<<(Print &printer, const CLImage &cl_image){
}
if (err != CL_SUCCESS) {
// TODO: error handling
CL_CHECK_ERRORS(err);
}
for (int i = 0; i < cl_image.numel(); i += stride) {
printer << data[i] << " ";
}
for (int i = 0; i < cl_image.numel(); i += stride) {
printer << data[i] << " ";
}
return printer;
}
}
#endif
} // namespace framework
} // namespace paddle_mobile
......@@ -46,27 +46,28 @@ class CLImage {
/*
* need call SetTensorData first
* */
void InitCLImage(cl_context context,cl_command_queue command_queue) {
void InitCLImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
if (tensor_dims_.size() <= 2) {
InitCLImage2C(context, command_queue,tensor_data_, tensor_dims_);
InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
} else {
InitCLImage(context, command_queue,tensor_data_, tensor_dims_);
InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
}
delete[](tensor_data_);
tensor_data_ = nullptr;
initialized_ = true;
}
void InitEmptyImage(cl_context context, cl_command_queue command_queue,const DDim &dim) {
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
if (tensor_data_ != nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(
" empty image tensor data shouldn't have value");
}
DLOG << " init empty image ";
InitCLImage(context, command_queue,nullptr, dim);
InitCLImage(context, command_queue, nullptr, dim);
initialized_ = true;
}
......@@ -93,7 +94,7 @@ class CLImage {
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
inline cl_command_queue CommandQueue() const{ return command_queue_;}
inline cl_command_queue CommandQueue() const { return command_queue_; }
/*
* resize original tensor dim
......@@ -124,7 +125,8 @@ class CLImage {
const DDim &dims() const { return tensor_dims_; }
private:
void InitCLImage2C(cl_context context, cl_command_queue command_queue,float *tensor_data, const DDim &dim) {
void InitCLImage2C(cl_context context, cl_command_queue command_queue,
float *tensor_data, const DDim &dim) {
command_queue_ = command_queue;
assert(dim.size() <= 2);
int tdim[2] = {1, 1};
......@@ -141,43 +143,44 @@ class CLImage {
imageData.reset(new half_t[width * height * 4]);
for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) {
imageData[(h * width + w / 4) * 4 + (w % 4)] = Float2Half(tensor_data[h * tdim[1] + w]);
imageData[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor_data[h * tdim[1] + w]);
}
}
}
InitCLImage(context, width, height, imageData.get());
}
void InitCLImage(cl_context context,int width, int height, void *data) {
void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = width,
.image_height = height,
.image_depth = 1,
.image_array_size = 1,
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
// .buffer = nullptr
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = width,
.image_height = height,
.image_depth = 1,
.image_array_size = 1,
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
// .buffer = nullptr
};
cid.buffer = nullptr;
cl_int err;
cl_image_ = clCreateImage(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
&cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc
data, // void *host_ptr
&err
);
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
&cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc
data, // void *host_ptr
&err);
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
}
void InitCLImage(cl_context context, cl_command_queue command_queue,float *tensor_data, const DDim &dim) {
void InitCLImage(cl_context context, cl_command_queue command_queue,
float *tensor_data, const DDim &dim) {
DLOG << " tensor dim: " << dim;
// NCHW -> [W * (C+3)/4, H * N]
tensor_dims_ = dim;
......@@ -207,6 +210,7 @@ class CLImage {
image_width_ = width;
image_height_ = height;
image_dims_ = make_ddim({image_width_, image_height_});
c_block_ = W / width;
std::unique_ptr<half_t[]> imageData{};
int count = 0;
......
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "framework/cl/cl_image.h"
#endif
int debug_to = 115;
int debug_to = 3;
namespace paddle_mobile {
namespace framework {
......@@ -953,13 +953,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->InitEmptyImage(context,command_queue, ddim);
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
......@@ -1011,11 +1012,12 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, command_queue,ddim);
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
......
......@@ -73,7 +73,7 @@ void OperatorBase<Dtype>::Run() {
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) {
DLOG << type_ << " input- " << key << "="<<*cl_image;
DLOG << type_ << " input- " << key << "=" << *cl_image;
}
}
......@@ -98,7 +98,7 @@ void OperatorBase<Dtype>::Run() {
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) {
DLOG << type_ << " output- " << key << "="<<*cl_image ;
DLOG << type_ << " output- " << key << "=" << *cl_image;
}
}
......
......@@ -49,11 +49,13 @@ bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
framework::CLImage *new_scale = new framework::CLImage();
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
new_scale->InitCLImage(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
new_bias->InitCLImage(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
......
......@@ -19,6 +19,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
......
......@@ -29,8 +29,10 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),cl_helper_.CLCommandQueue());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// const CL *mean = param->InputMean();
const framework::CLImage *mean = param->InputMean();
......@@ -62,12 +64,14 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
framework::CLImage *new_scale = new framework::CLImage();
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext(),cl_helper_.CLCommandQueue());
new_scale->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(),cl_helper_.CLCommandQueue());
new_bias->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
......
......@@ -25,8 +25,10 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
......
......@@ -26,7 +26,8 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
......@@ -95,6 +96,17 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
cl_int status;
DLOG << " begin set kernel arg ";
DLOG << " c block " << c_block;
DLOG << " w " << w;
DLOG << " nh " << nh;
DLOG << " stride " << stride;
DLOG << " offset " << offset;
DLOG << " input_c " << input_c;
DLOG << " dilation " << dilation;
DLOG << " input width " << input_width;
DLOG << " input height " << input_height;
DLOG << " output width " << output_width;
DLOG << " output height " << output_height;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
......
......@@ -27,7 +27,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue());
param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
......
......@@ -30,7 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
cl_int status;
auto output = param.Out();
const Tensor *input = param.InputX();
DLOG<<*input;
DLOG << *input;
const float *input_data = input->data<float>();
int numel = input->numel();
cl_mem cl_image = output->GetCLImage();
......
......@@ -19,44 +19,45 @@ namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
// this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
// this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
return true;
}
template <>
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
//
// auto input = param.InputX()->GetCLImage();
// auto *out = param.Out();
//
// const auto &dims = param.InputX()->dims();
// const int N = dims[0];
// const int C = dims[1];
// const int in_height = dims[2];
// const int in_width = dims[3];
//
// int size_ch = in_height * in_width;
// int size_block = size_ch * 4;
// int size_batch = size_ch * C;
//
// // need create outputBuffer
// cl_image_format imageFormat;
// imageFormat.image_channel_order = CL_RGBA;
// imageFormat.image_channel_data_type = CL_FLOAT;
// cl_mem outputBuffer;
//
// clSetKernelArg(kernel, 0, sizeof(int), &in_height);
// clSetKernelArg(kernel, 1, sizeof(int), &in_width);
// clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
// clSetKernelArg(kernel, 3, sizeof(int), &size_block);
// clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
// clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
// clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
//
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL);
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size =
// this->cl_helper_.DefaultWorkSize(*param.InputX());
//
// auto input = param.InputX()->GetCLImage();
// auto *out = param.Out();
//
// const auto &dims = param.InputX()->dims();
// const int N = dims[0];
// const int C = dims[1];
// const int in_height = dims[2];
// const int in_width = dims[3];
//
// int size_ch = in_height * in_width;
// int size_block = size_ch * 4;
// int size_batch = size_ch * C;
//
// // need create outputBuffer
// cl_image_format imageFormat;
// imageFormat.image_channel_order = CL_RGBA;
// imageFormat.image_channel_data_type = CL_FLOAT;
// cl_mem outputBuffer;
//
// clSetKernelArg(kernel, 0, sizeof(int), &in_height);
// clSetKernelArg(kernel, 1, sizeof(int), &in_width);
// clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
// clSetKernelArg(kernel, 3, sizeof(int), &size_block);
// clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
// clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
// clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
//
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL);
}
template class FetchKernel<GPU_CL, float>;
......
......@@ -37,10 +37,10 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
int dims[4] = {1, 1, 1, 1};
int odims[4] = {1, 1, 1, 1};
for (int i = 0; i < inputDim.size(); i++) {
dims[4-inputDim.size()+i] = inputDim[i];
dims[4 - inputDim.size() + i] = inputDim[i];
}
for (int i = 0; i < outputDim.size(); i++) {
odims[4-outputDim.size()+i] = outputDim[i];
odims[4 - outputDim.size() + i] = outputDim[i];
}
clSetKernelArg(kernel, 2, sizeof(cl_int), &dims);
clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]);
......
......@@ -17,7 +17,7 @@ shift
perl -i -pe 's|^\s+#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
(
# remove clang format ios_io folder
flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||')
clang-format -i $flist
)
perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册