Commit 1a8af5ef authored by yangfei

imp some function

Parent 70719c56
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){
DDim ddim = cl_image->dims();
size_t N,C,H,W;
if(ddim.size()==4){
N = ddim[0];
if(N<0){
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
}else if(ddim.size()==1){
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0,0,0};
size_t region[3] = {width,height,1};
err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1<<2) + c % 4;
for (int w = 0; w < W; w++) {
*p = half2float(imageData[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
if (err != CL_SUCCESS) {
// TODO: error handling
}
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
  size_t N = 1, C = 1, H = 1, W = 1;
  if (ddim.size() == 4) {
    N = ddim[0] < 0 ? 1 : ddim[0];
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
  } else if (ddim.size() == 1) {
    C = ddim[0];
  }
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;

  const float *p = tensor->data<float>();
  std::vector<half> imageData(width * height * 4);
  cl_mem image = cl_image->GetCLImage();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {width, height, 1};

  // Read the current image contents first so that padded components keep
  // their values, then overwrite the real channels from the tensor.
  cl_int err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region,
                                  0, 0, imageData.data(), 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    // TODO: error handling
    return;
  }

  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      size_t i1 = i0;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          imageData[i2] = float2half(*p);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }

  // Upload the converted data back to the OpenCL image; without this step
  // the image never sees the tensor's values.
  err = clEnqueueWriteImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                            imageData.data(), 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    // TODO: error handling
  }
}

}  // namespace framework
}  // namespace paddle_mobile
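For reference, a minimal debugging sketch of how these helpers could be used to pull a CLImage back to the host, mirroring the commented-out path in operator.cpp further down. This is not part of the commit: the includes are omitted, and `GetCLScpoe()->CommandQueue()` is assumed to exist only because it is referenced elsewhere in this diff.

```cpp
// Hypothetical debug helper (not part of this commit): copy a CLImage into a
// host Tensor so its contents can be logged. Assumes the scope exposes
// GetCLScpoe()->CommandQueue() as referenced in operator.cpp.
void DumpCLImage(framework::CLImage *cl_image, framework::Scope *scope) {
  framework::Tensor host_tensor;
  host_tensor.Resize(cl_image->dims());
  host_tensor.mutable_data<float>();  // allocate host storage before the copy
  framework::CLImageToTensor(cl_image, &host_tensor,
                             scope->GetCLScpoe()->CommandQueue());
  DLOG << "cl_image contents: " << host_tensor;
}
```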
......@@ -57,8 +57,6 @@ class CLImage {
height_of_one_block_ = H;
}
DLOG << "-------InitMemory-------";
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
......@@ -95,7 +93,6 @@ class CLImage {
i0 += width * H;
}
}
DLOG << "-------InitMemory-------";
cl_int err;
cl_image_ = clCreateImage2D(
context, // cl_context context
......@@ -156,13 +153,9 @@ class CLImage {
cl_context context_;
};
// void TensorToCLImage(Tensor *tensor, CLImage *image) {
//
//}
//
// void CLImageToTensor(CLImage *image, Tensor *tensor) {
//
//}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
                     cl_command_queue commandQueue);
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
                     cl_command_queue commandQueue);
} // namespace framework
} // namespace paddle_mobile
......@@ -53,8 +53,8 @@ class CLScope {
return it->second.get();
}
auto program =
CLEngine::Instance()->CreateProgramWith(context_.get(), "./cl_kernel/" + file_name);
auto program = CLEngine::Instance()->CreateProgramWith(
context_.get(), "./cl_kernel/" + file_name);
programs_[file_name] = std::move(program);
status_ = clBuildProgram(programs_[file_name].get(), 0, 0, 0, 0, 0);
......
......@@ -938,9 +938,9 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
cl_context context = program_.scope->GetCLScpoe()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = framework::make_ddim(desc.Dims());
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->Init(context, ddim);
}
}
......@@ -989,7 +989,8 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
cl_context context = program_.scope->GetCLScpoe()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->Init(context, ddim);
}
......
......@@ -66,8 +66,26 @@ void OperatorBase<Dtype>::Run() {
for (int i = 0; i < var_vec_in.size(); ++i) {
auto vari = scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) {
#ifdef PADDLE_MOBILE_CL
if (type_ == "feed") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
// cl_command_queue commandQueue =
// scope_->GetCLScpoe()->CommandQueue(); Tensor *tmp ;
// CLImageToTensor(cl_image,tmp,commandQueue);
// tmp->Resize(cl_image->dims());
if (cl_image) {
// DLOG<<type_<<" input- "<<key<<"="<<*tmp;
DLOG << type_ << " input- " << key << "=" << cl_image->dims();
}
}
#else
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
#endif
}
}
}
......@@ -76,8 +94,20 @@ void OperatorBase<Dtype>::Run() {
for (int i = 0; i < var_vec_out.size(); ++i) {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
#ifdef PADDLE_MOBILE_CL
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
// cl_command_queue commandQueue =
// scope_->GetCLScpoe()->CommandQueue(); Tensor *tmp ;
// CLImageToTensor(cl_image,tmp,commandQueue);
// tmp->Resize(cl_image->dims());
if (cl_image) {
// DLOG<<type_<<" output- "<<key<<"="<<*tmp;
DLOG << type_ << " output- " << key << "=" << cl_image->dims();
}
#else
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
#endif
}
}
}
......
......@@ -14,7 +14,21 @@ limitations under the License. */
#include "feed_op.h"
namespace paddle_mobile {
namespace operators {}
namespace operators {
#ifdef PADDLE_MOBILE_CL
template <typename DeviceType, typename T>
void FeedOp<DeviceType, T>::InferShape() const {
auto out_dims = this->param_.Out()->dims();
out_dims[0] = this->param_.BatchSize();
this->param_.Out()->Resize(out_dims);
}
template <typename DeviceType, typename T>
void FeedOp<DeviceType, T>::RunImpl() {
this->kernel_.Compute(this->param_);
}
#endif
}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -16,11 +16,32 @@ limitations under the License. */
#include <string>
#include "framework/operator.h"
#include "operators/kernel/feed_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
#ifdef PADDLE_MOBILE_CL
template <typename DeviceType, typename T>
class FeedOp
: public framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
FeedKernel<DeviceType, T>> {
public:
FeedOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
FeedKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
void RunImpl() override;
protected:
};
#else
template <typename DeviceType, typename T>
class FeedOp : public framework::OperatorBase<DeviceType> {
public:
......@@ -74,22 +95,20 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
}
#else
#ifdef PADDLE_MOBILE_CL
void Init() {}
void RunImpl() {}
#else
void Init() {}
void RunImpl() {
param_.Out()->ShareDataWith(*param_.InputX());
param_.Out()->set_lod(param_.InputX()->lod());
param_.Out()->ShareDataWith(*param_.InputX());
param_.Out()->set_lod(param_.InputX()->lod());
}
#endif
#endif
protected:
FeedParam<DeviceType> param_;
};
#endif
#endif
} // namespace operators
} // namespace paddle_mobile
......
// Packs the first three channels of an NCHW float buffer into the RGBA
// components of a w x h image; the fourth component is zero padding.
__kernel void feed(__global float *in, __write_only image2d_t outputImage,
                   int h, int w)
{
    int j = get_global_id(0);  // column index, 0 <= j < w
    int i = get_global_id(1);  // row index,    0 <= i < h
    float4 pixel;
    pixel.x = in[i * w + j];
    pixel.y = in[h * w + (i * w + j)];
    pixel.z = in[2 * h * w + (i * w + j)];
    pixel.w = 0;
    int2 coords;
    coords.x = j;
    coords.y = i;
    write_imagef(outputImage, coords, pixel);
}
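In other words, each work-item copies one (i, j) position of a 3-channel NCHW float input into one RGBA pixel, zeroing the fourth component. A plain C++ reference of the same mapping (hypothetical, for illustration only) looks like this:

```cpp
#include <vector>

// CPU reference (hypothetical) of the mapping performed by the feed kernel:
// the first three channels of an NCHW float input are packed into the RGBA
// components of a w x h image, with the fourth component zeroed.
std::vector<float> FeedReference(const float *in, int h, int w) {
  std::vector<float> rgba(4 * h * w);
  for (int i = 0; i < h; ++i) {    // i corresponds to get_global_id(1)
    for (int j = 0; j < w; ++j) {  // j corresponds to get_global_id(0)
      float *pixel = &rgba[4 * (i * w + j)];
      pixel[0] = in[i * w + j];              // channel 0
      pixel[1] = in[h * w + i * w + j];      // channel 1
      pixel[2] = in[2 * h * w + i * w + j];  // channel 2
      pixel[3] = 0.0f;                       // padding
    }
  }
  return rgba;
}
```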
......@@ -21,61 +21,58 @@ namespace operators {
template <>
bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
if (param->Filter()->WidthOfOneBlock() == 1 &&
param->Filter()->HeightOfOneBlock() == 1) {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->dims()[1] == 1) {
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
} else if (param->Filter()->WidthOfOneBlock() == 3 &&
param->Filter()->HeightOfOneBlock() == 3) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
// PADDLE_MOBILE_ENFORCE(
// param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
// param->Paddings()[0] == param->Paddings()[1],
// "need equal");
// int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
// static_cast<int>(param->Paddings()[1]);
// param->SetOffset(offset);
//
// if (param->Filter()->WidthOfOneBlock() == 1 &&
// param->Filter()->HeightOfOneBlock() == 1) {
// this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
// } else if (param->Filter()->dims()[1] == 1) {
// this->cl_helper_.AddKernel("depth_conv_3x3",
// "conv_add_bn_relu_kernel.cl");
// } else if (param->Filter()->WidthOfOneBlock() == 3 &&
// param->Filter()->HeightOfOneBlock() == 3) {
// this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
// } else {
// PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
// }
return true;
}
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &c_block);
clSetKernelArg(kernel, 1, sizeof(int), &w);
clSetKernelArg(kernel, 2, sizeof(int), &nh);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 6, sizeof(int), &stride);
clSetKernelArg(kernel, 7, sizeof(int), &offset);
clSetKernelArg(kernel, 8, sizeof(int), &input_c);
clSetKernelArg(kernel, 9, sizeof(int), &dilation);
clSetKernelArg(kernel, 10, sizeof(int), &input_width);
clSetKernelArg(kernel, 11, sizeof(int), &input_height);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size =
// this->cl_helper_.DefaultWorkSize(*param.Output()); int c_block =
// default_work_size[0]; int w = default_work_size[1]; int nh =
// default_work_size[2]; auto input = param.Input()->GetCLImage(); auto
// filter = param.Filter()->GetCLImage(); auto output = param.Output(); int
// stride = param.Strides()[0]; int offset = param.Offset(); int input_c =
// param.Input()->CBlock(); int dilation = param.Dilations()[0]; int
// input_width = param.Input()->WidthOfOneBlock(); int input_height =
// param.Input()->HeightOfOneBlock();
//
// clSetKernelArg(kernel, 0, sizeof(int), &c_block);
// clSetKernelArg(kernel, 1, sizeof(int), &w);
// clSetKernelArg(kernel, 2, sizeof(int), &nh);
// clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
// clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
// clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
// clSetKernelArg(kernel, 6, sizeof(int), &stride);
// clSetKernelArg(kernel, 7, sizeof(int), &offset);
// clSetKernelArg(kernel, 8, sizeof(int), &input_c);
// clSetKernelArg(kernel, 9, sizeof(int), &dilation);
// clSetKernelArg(kernel, 10, sizeof(int), &input_width);
// clSetKernelArg(kernel, 11, sizeof(int), &input_height);
//
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL);
// auto kernel = this->cl_helper_.KernelAt(0);
// size_t global_work_size[3] = {1, 2, 3};
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/feed_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
DLOG<<"Init feed";
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
DLOG<<"feed_kernel";
auto kernel = this->cl_helper_.KernelAt(0);
cl_int status;
auto output = param.Out();
auto input = param.InputX();
const float *input_data = input->data<float>();
cl_mem cl_image = output->GetCLImage();
int height = output->dims()[2];
int width = output->dims()[3];
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height);
size_t global_work_size[2] = {height,width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
}
template class FeedKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
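A note on the launch above: because j = get_global_id(0) indexes the image width and i = get_global_id(1) indexes the height in feed_kernel.cl, the global work size is {width, height}. OpenCL also keeps a memory object alive while enqueued commands still use it, so releasing the staging buffer right after clEnqueueNDRangeKernel is safe; the runtime frees it once the kernel has finished.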
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>>{
public:
void Compute(const FeedParam<DeviceType> &param);
bool Init(FeedParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
......@@ -911,13 +911,13 @@ class FeedParam : public OpParam {
public:
FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
auto var = scope->Var("batch_size");
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
auto var = scope.FindVar("batch_size");
batch_size = var->GetValue<int>();
}
const GType *InputX() const { return input_x_; }
const LoDTensor *InputX() const { return input_x_; }
GType *Out() const { return out_; }
const int BatchSize() const { return batch_size; }
......