Commit 7fbea0e2 authored by yangfei

add some functions

Parent 49bce554
@@ -14,20 +14,20 @@ limitations under the License. */
#include "cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
cl_command_queue commandQueue) {
DDim ddim = cl_image->dims();
size_t N,C,H,W;
if(ddim.size()==4){
size_t N, C, H, W;
if (ddim.size() == 4) {
N = ddim[0];
if(N<0){
if (ddim[0] < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
}else if(ddim.size()==1){
} else if (ddim.size() == 1) {
N = 1;
C = ddim[0];
H = 1;
@@ -41,15 +41,16 @@ namespace paddle_mobile {
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0,0,0};
size_t region[3] = {width,height,1};
err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1<<2) + c % 4;
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = half2float(imageData[i2]);
i2 += 4;
@@ -61,25 +62,23 @@ namespace paddle_mobile {
i0 += width * H;
}
if (err != CL_SUCCESS) {
// TODO: error handling
}
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,cl_command_queue commandQueue){
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
cl_command_queue commandQueue) {
DDim ddim = cl_image->dims();
size_t N,C,H,W;
if(ddim.size()==4){
size_t N, C, H, W;
if (ddim.size() == 4) {
N = ddim[0];
if(N<0){
if (ddim[0] < 0) {
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
}else if(ddim.size()==1){
} else if (ddim.size() == 1) {
N = 1;
C = ddim[0];
H = 1;
@@ -92,10 +91,11 @@ namespace paddle_mobile {
const float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0,0,0};
size_t region[3] = {width,height,1};
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
cl_int err;
err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) {
// TODO: error handling
}
@@ -104,7 +104,7 @@ namespace paddle_mobile {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1<<2) + c % 4;
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
imageData[i2] = float2half(*p);
i2 += 4;
@@ -115,9 +115,6 @@ namespace paddle_mobile {
}
i0 += width * H;
}
}
}
}
} // namespace framework
} // namespace paddle_mobile
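The loop nests above implement the NCHW-to-RGBA-half addressing that cl_image.h documents as "NCHW -> [W * (C+3)/4, H * N]". A minimal host-side sketch of that mapping, assuming exactly that layout (ImageElementIndex is a hypothetical helper for illustration, not part of this commit):

#include <cstddef>

// Flat index of tensor element (n, c, h, w) inside the half buffer that
// clEnqueueReadImage transfers, for an image laid out as
// [W * (C + 3) / 4, H * N] with 4 half values per RGBA pixel.
inline size_t ImageElementIndex(size_t n, size_t c, size_t h, size_t w,
                                size_t C, size_t H, size_t W) {
  size_t image_width = W * ((C + 3) / 4);    // pixels per image row
  size_t x = (c / 4) * W + w;                // channel groups tile along x
  size_t y = n * H + h;                      // batches tile along y
  return 4 * (y * image_width + x) + c % 4;  // c % 4 selects the RGBA lane
}

Stepping w by one advances this index by 4, which matches the `i2 += 4` stride in the loops above.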
@@ -30,6 +30,20 @@ class CLImage {
void Init(cl_context context, float *tensorInput, DDim ddim) {
tensor_dims_ = ddim;
if (tensorInput) {
tensor_input_ = tensorInput;
} else {
int numel = 1;
for (int i = 0; i < ddim.size(); i++) {
numel *= ddim[i];
}
tensor_input_ = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
for (int i = 0; i < numel; i++) {
tensor_input_[i] = 0;
}
}
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
// NCHW -> [W * (C+3)/4, H * N]
@@ -65,9 +79,9 @@ class CLImage {
std::unique_ptr<half_t[]> imageData{};
int count = 0;
if (tensorInput != nullptr) {
imageData.reset(new half_t[width * height * 4]);
float *p = tensorInput;
if (tensor_input_ != nullptr) {
float *p = tensor_input_;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
@@ -75,11 +89,13 @@ class CLImage {
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (i2 >= width * height * 4) {
printf("%d > %d ----> %d, %d, %d, %d --- %d, %d, %d\n", i2,
width * height * 4, n, c, h, w, i0, i1, i2);
}
assert(i2 < width * height * 4);
// if (i2 >= width * height * 4) {
//   printf("%d > %d ----> %d, %d, %d, %d --- %d, %d, %d\n",
//          i2, width * height * 4, n, c, h, w, i0, i1, i2);
// }
// assert(i2 < width * height * 4);
imageData[i2] = float2half(*p);
i2 += 4;
@@ -153,9 +169,11 @@ class CLImage {
cl_context context_;
};
void TensorToCLImage(Tensor *tensor, CLImage *image);
void TensorToCLImage(Tensor *tensor, CLImage *image,
cl_command_queue commandQueue);
void CLImageToTensor(CLImage *image, Tensor *tensor);
void CLImageToTensor(CLImage *image, Tensor *tensor,
cl_command_queue commandQueue);
} // namespace framework
} // namespace paddle_mobile
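The new Init fallback above zero-fills a freshly allocated host buffer when no tensorInput is supplied. A usage sketch under that assumption (make_ddim as used elsewhere in this framework, the usual framework include path, and a live cl_context are assumed):

#include "framework/cl/cl_image.h"

// Creates an all-zero CL image of shape {1, 4, 8, 8}: Init allocates
// 1*4*8*8 floats, fills them with 0, converts them to half, and uploads
// them as a CL_RGBA / CL_HALF_FLOAT image.
void ZeroImageExample(cl_context context) {
  paddle_mobile::framework::CLImage image;
  image.Init(context, /*tensorInput=*/nullptr,
             paddle_mobile::framework::make_ddim({1, 4, 8, 8}));
}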
@@ -56,7 +56,8 @@ class CLScope {
auto program = CLEngine::Instance()->CreateProgramWith(
context_.get(), "./cl_kernel/" + file_name);
status_ = clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
status_ =
clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
CL_CHECK_ERRORS(status_);
programs_[file_name] = std::move(program);
......
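clBuildProgram reports compiler failures only through an error code, while the actual diagnostics live in the build log. A sketch of fetching it with the standard clGetProgramBuildInfo query (GetBuildLog is an illustrative helper, not part of this commit):

#include <CL/cl.h>
#include <string>
#include <vector>

// Returns the compiler log for `program` on `device`; most useful right
// after clBuildProgram returns a non-CL_SUCCESS status.
std::string GetBuildLog(cl_program program, cl_device_id device) {
  size_t log_size = 0;
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr,
                        &log_size);
  std::vector<char> log(log_size + 1, '\0');
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size,
                        log.data(), nullptr);
  return std::string(log.data());
}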
@@ -931,7 +931,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
cl_image->Init(context, tensorInput, ddim);
delete origin_data;
paddle_mobile::memory::Free(tensorInput);
// paddle_mobile::memory::Free(tensorInput);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
......
@@ -73,12 +73,15 @@ void OperatorBase<Dtype>::Run() {
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
// cl_command_queue commandQueue = scope_->GetCLScpoe()->CommandQueue();
// Tensor *tmp;
// CLImageToTensor(cl_image,tmp,commandQueue);
// tmp->Resize(cl_image->dims());
const float *input = cl_image->data<float>();
if (cl_image) {
// DLOG<<type_<<" input- "<<key<<"="<<*tmp;
DLOG << type_ << " input- " << key << "=" << cl_image->dims();
// if(input)
// DLOG<<type_<<" input- "<<key<<"="<<*input;
}
}
@@ -95,15 +98,24 @@ void OperatorBase<Dtype>::Run() {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
#ifdef PADDLE_MOBILE_CL
if (type_ == "fetch") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor)
DLOG << type_ << " output- " << key << "=" << tensor->dims();
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
// cl_command_queue commandQueue = scope_->GetCLScpoe()->CommandQueue();
// Tensor *tmp;
// CLImageToTensor(cl_image,tmp,commandQueue);
// tmp->Resize(cl_image->dims());
if (cl_image) {
// DLOG<<type_<<" output- "<<key<<"="<<*tmp;
const float *output = cl_image->data<float>();
DLOG << type_ << " output- " << key << "=" << cl_image->dims();
// if(output)
// DLOG<<type_<<" output- "<<key<<"="<<*output;
}
}
#else
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
......
@@ -18,9 +18,10 @@ limitations under the License. */
inline half4 activation(half4 in
#ifdef PRELU
,half4 prelu_alpha
,
half4 prelu_alpha
#endif
) {
) {
half4 output;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (half4)0.0);
@@ -31,4 +32,3 @@ inline half4 activation(half4 in
#endif
return output;
}
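For readers unfamiliar with the vector select above: select(a, b, c) returns, per lane, b where condition c holds and a elsewhere, so the PRELU branch is a per-lane PReLU. A scalar C++ equivalent, for illustration only:

// Scalar equivalent of: select(prelu_alpha * in, in, in >= (half4)0.0).
// With PRELU undefined the kernel instead clamps negatives to 0 (ReLU).
float activation_scalar(float in, float prelu_alpha) {
  return in >= 0.0f ? in : prelu_alpha * in;  // PReLU
}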
@@ -36,7 +36,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
}
template <>
void DepthwiseConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
void DepthwiseConvKernel<GPU_CL, float>::Compute(
const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
......
@@ -12,42 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/log.h"
#include "operators/kernel/feed_kernel.h"
#include "common/log.h"
namespace paddle_mobile {
namespace operators {
namespace operators {
template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
DLOG<<"Init feed";
template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
DLOG << "Init feed";
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
}
DLOG<<"feed_kernel";
template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
cl_int status;
auto output = param.Out();
auto input = param.InputX();
const float *input_data = input->data<float>();
const Tensor *input = param.InputX();
const float *input_data = nullptr;
input_data = input->data<float>();
cl_mem cl_image = output->GetCLImage();
int height = output->dims()[2];
int width = output->dims()[3];
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height);
DLOG << output->dims();
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image);
status = clSetKernelArg(kernel, 2, sizeof(int), &width);
status = clSetKernelArg(kernel, 3, sizeof(int), &height);
size_t global_work_size[2] = {height,width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
}
size_t global_work_size[2] = {height, width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
global_work_size, NULL, 0, NULL, NULL);
}
template class FeedKernel<GPU_CL, float>;
template class FeedKernel<GPU_CL, float>;
} // namespace operators
} // namespace operators
} // namespace paddle_mobile
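Each clSetKernelArg returns a status, and a single wrong index or size makes the whole launch fail silently here since none of the statuses are checked. A defensive sketch, assuming the feed kernel's parameters are (input buffer, output image, width, height) in that order (SetFeedArgs is an illustrative helper, not part of this commit):

#include <CL/cl.h>

// Sets the feed kernel's arguments one index at a time, bailing out on
// the first failure so the offending argument is identifiable.
cl_int SetFeedArgs(cl_kernel kernel, cl_mem input_buffer, cl_mem out_image,
                   int width, int height) {
  cl_int status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer);
  if (status != CL_SUCCESS) return status;
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_image);
  if (status != CL_SUCCESS) return status;
  status = clSetKernelArg(kernel, 2, sizeof(int), &width);
  if (status != CL_SUCCESS) return status;
  return clSetKernelArg(kernel, 3, sizeof(int), &height);
}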
@@ -18,15 +18,15 @@ limitations under the License. */
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>>{
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
public:
void Compute(const FeedParam<DeviceType> &param);
bool Init(FeedParam<DeviceType> *param);
};
};
} // namespace operators
} // namespace operators
} // namespace paddle_mobile
@@ -936,14 +936,14 @@ class FetchParam : public OpParam {
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
Tensor *Out() const { return out_; }
private:
RType *input_x_;
RType *out_;
Tensor *out_;
};
#ifdef TRANSPOSE_OP
......