Commit 819523a8 authored by yangfei

Implement some functions

Parent fe7f8763
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){
DDim ddim = cl_image->dims();
size_t N,C,H,W;
if(ddim.size()==4){
N = ddim[0];
if(N<0){
N = 1;
}
C = ddim[1];
H = ddim[2];
W = ddim[3];
}else if(ddim.size()==1){
N = 1;
C = ddim[0];
H = 1;
W = 1;
}
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
float *p = tensor->data<float>();
half imageData[width * height * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0,0,0};
size_t region[3] = {width,height,1};
err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0;
for (int h = 0; h < H; h++) {
size_t i2 = (i1<<2) + c % 4;
for (int w = 0; w < W; w++) {
*p = half2float(imageData[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
if (err != CL_SUCCESS) {
// TODO: error handling
}
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
  size_t N = 1, C = 1, H = 1, W = 1;
  if (ddim.size() == 4) {
    N = ddim[0] < 0 ? 1 : ddim[0];
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
  } else if (ddim.size() == 1) {
    C = ddim[0];
  }
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;
  const float *p = tensor->data<float>();
  half imageData[width * height * 4];
  cl_mem image = cl_image->GetCLImage();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {width, height, 1};
  // Read the current contents first so padding texels (channel slots past C)
  // keep whatever values they already hold.
  cl_int err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region,
                                  0, 0, imageData, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    // TODO: error handling
    return;
  }
  size_t i0 = 0;
  for (size_t n = 0; n < N; n++) {
    for (size_t c = 0; c < C; c++) {
      size_t i1 = i0;
      for (size_t h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (size_t w = 0; w < W; w++) {
          imageData[i2] = float2half(*p);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
  // Write the converted data back to the device image; the original version
  // filled imageData but never copied it back, leaving the image unchanged.
  err = clEnqueueWriteImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                            imageData, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    // TODO: error handling
  }
}
}  // namespace framework
}  // namespace paddle_mobile
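A minimal round-trip sketch of the two helpers above (not part of the commit; it assumes an initialized CLImage `image`, a valid cl_command_queue `queue`, and the Resize/mutable_data Tensor interface used elsewhere in paddle-mobile):

  framework::Tensor host_tensor;
  host_tensor.Resize(image.dims());   // tensor shape must match the image dims
  host_tensor.mutable_data<float>();  // allocate storage behind data<float>()
  // Device image -> host tensor (blocking read inside).
  framework::CLImageToTensor(&image, &host_tensor, queue);
  // ... inspect or modify host_tensor on the CPU ...
  // Host tensor -> device image (relies on the write-back added above).
  framework::TensorToCLImage(&host_tensor, &image, queue);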
...
@@ -57,8 +57,6 @@ class CLImage {
       height_of_one_block_ = H;
     }
-    DLOG << "-------InitMemory-------";
     size_t width = W * ((C + 3) / 4);
     size_t height = H * N;
...
@@ -95,7 +93,6 @@ class CLImage {
         i0 += width * H;
       }
     }
-    DLOG << "-------InitMemory-------";
     cl_int err;
     cl_image_ = clCreateImage2D(
         context,  // cl_context context
...
@@ -156,13 +153,9 @@ class CLImage {
   cl_context context_;
 };

-// void TensorToCLImage(Tensor *tensor, CLImage *image) {
-//
-//}
-//
-// void CLImageToTensor(CLImage *image, Tensor *tensor) {
-//
-//}
+void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
+                     cl_command_queue commandQueue);
+
+void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
+                     cl_command_queue commandQueue);

 }  // namespace framework
 }  // namespace paddle_mobile
...
@@ -53,8 +53,8 @@ class CLScope {
       return it->second.get();
     }

-    auto program =
-        CLEngine::Instance()->CreateProgramWith(context_.get(), "./cl_kernel/" + file_name);
+    auto program = CLEngine::Instance()->CreateProgramWith(
+        context_.get(), "./cl_kernel/" + file_name);
     programs_[file_name] = std::move(program);
     status_ = clBuildProgram(program.get(), 0, 0, 0, 0, 0);
......
...
@@ -938,9 +938,9 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
           cl_context context = program_.scope->GetCLScpoe()->Context();
           const framework::TensorDesc &desc = var_desc->Tensor_desc();
-          framework::DDim ddim = framework::make_ddim(desc.Dims());
+          // framework::DDim ddim = framework::make_ddim(desc.Dims());
+          framework::DDim ddim = cl_image->dims();
           DLOG << var_desc->Name();
           cl_image->Init(context, ddim);
         }
       }
...
@@ -989,7 +989,8 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
       cl_context context = program_.scope->GetCLScpoe()->Context();
       const framework::TensorDesc &desc = var_desc->Tensor_desc();
-      framework::DDim ddim = framework::make_ddim(desc.Dims());
+      framework::DDim ddim = cl_image->dims();
+      // framework::DDim ddim = framework::make_ddim(desc.Dims());
       cl_image->Init(context, ddim);
     }
......
...
@@ -66,8 +66,26 @@ void OperatorBase<Dtype>::Run() {
     for (int i = 0; i < var_vec_in.size(); ++i) {
       auto vari = scope_->FindVar(var_vec_in[i]);
       if (vari->IsInitialized()) {
+#ifdef PADDLE_MOBILE_CL
+        if (type_ == "feed") {
+          Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+          if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
+        } else {
+          CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
+          // cl_command_queue commandQueue = scope_->GetCLScpoe()->CommandQueue();
+          // Tensor *tmp;
+          // CLImageToTensor(cl_image, tmp, commandQueue);
+          // tmp->Resize(cl_image->dims());
+          if (cl_image) {
+            // DLOG << type_ << " input- " << key << "=" << *tmp;
+            DLOG << type_ << " input- " << key << "=" << cl_image->dims();
+          }
+        }
+#else
         Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
         if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
+#endif
       }
     }
   }
...
@@ -76,8 +94,20 @@ void OperatorBase<Dtype>::Run() {
     for (int i = 0; i < var_vec_out.size(); ++i) {
       auto vari = scope_->FindVar(var_vec_out[i]);
       if (vari->IsInitialized()) {
+#ifdef PADDLE_MOBILE_CL
+        CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
+        // cl_command_queue commandQueue = scope_->GetCLScpoe()->CommandQueue();
+        // Tensor *tmp;
+        // CLImageToTensor(cl_image, tmp, commandQueue);
+        // tmp->Resize(cl_image->dims());
+        if (cl_image) {
+          // DLOG << type_ << " output- " << key << "=" << *tmp;
+          DLOG << type_ << " output- " << key << "=" << cl_image->dims();
+        }
+#else
         Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
         if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
+#endif
       }
     }
   }
......
...
@@ -14,7 +14,21 @@ limitations under the License. */

 #include "feed_op.h"

 namespace paddle_mobile {
-namespace operators {}
+namespace operators {
+
+#ifdef PADDLE_MOBILE_CL
+template <typename DeviceType, typename T>
+void FeedOp<DeviceType, T>::InferShape() const {
+  auto out_dims = this->param_.Out()->dims();
+  out_dims[0] = this->param_.BatchSize();
+  this->param_.Out()->Resize(out_dims);
+}
+
+template <typename DeviceType, typename T>
+void FeedOp<DeviceType, T>::RunImpl() {
+  this->kernel_.Compute(this->param_);
+}
+#endif
+
+}
 }  // namespace paddle_mobile

 namespace ops = paddle_mobile::operators;
......
...
@@ -16,11 +16,32 @@ limitations under the License. */

 #include <string>
 #include "framework/operator.h"
+#include "operators/kernel/feed_kernel.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {
 using std::string;
+
+#ifdef PADDLE_MOBILE_CL
+template <typename DeviceType, typename T>
+class FeedOp
+    : public framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
+                                           FeedKernel<DeviceType, T>> {
+ public:
+  FeedOp(const string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const framework::AttributeMap attrs,
+         std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
+                                      FeedKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  void InferShape() const override;
+  void RunImpl() override;
+
+ protected:
+};
+#else
 template <typename DeviceType, typename T>
 class FeedOp : public framework::OperatorBase<DeviceType> {
  public:
...
@@ -74,22 +95,20 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
   }
 #else
-#ifdef PADDLE_MOBILE_CL
-  void Init() {}
-  void RunImpl() {}
-#else
   void Init() {}
   void RunImpl() {
     param_.Out()->ShareDataWith(*param_.InputX());
     param_.Out()->set_lod(param_.InputX()->lod());
   }
 #endif
-#endif

  protected:
   FeedParam<DeviceType> param_;
 };
+#endif

 }  // namespace operators
 }  // namespace paddle_mobile
......
__kernel void feed(__global float *in, __write_only image2d_t outputImage,
                   int h, int w) {
  // get_global_id(0) walks the width, get_global_id(1) the height.
  int j = get_global_id(0);
  int i = get_global_id(1);
  float4 pixel;
  // Pack the first three NCHW channels into one RGBA texel; the kernel
  // therefore assumes a three-channel (e.g. RGB) input laid out as C x H x W.
  pixel.x = in[i * w + j];
  pixel.y = in[h * w + (i * w + j)];
  pixel.z = in[2 * h * w + (i * w + j)];
  pixel.w = 0;
  int2 coords;
  coords.x = j;
  coords.y = i;
  write_imagef(outputImage, coords, pixel);
}
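Host code launching this kernel must hand dimension 0 the width and dimension 1 the height, matching the indexing above. A hedged sketch (`queue`, `kernel`, `w`, and `h` are assumed to already be in scope):

  // Dimension 0 spans the width (j), dimension 1 the height (i).
  size_t global_work_size[2] = {static_cast<size_t>(w),
                                static_cast<size_t>(h)};
  cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size,
                                      NULL, 0, NULL, NULL);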
...
@@ -21,61 +21,58 @@ namespace operators {

 template <>
 bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
-  PADDLE_MOBILE_ENFORCE(
-      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-          param->Paddings()[0] == param->Paddings()[1],
-      "need equal");
-  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
-               static_cast<int>(param->Paddings()[1]);
-  param->SetOffset(offset);
-
-  if (param->Filter()->WidthOfOneBlock() == 1 &&
-      param->Filter()->HeightOfOneBlock() == 1) {
-    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
-  } else if (param->Filter()->dims()[1] == 1) {
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
-  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
-             param->Filter()->HeightOfOneBlock() == 3) {
-    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
-  } else {
-    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
-  }
+  // PADDLE_MOBILE_ENFORCE(
+  //     param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+  //         param->Paddings()[0] == param->Paddings()[1],
+  //     "need equal");
+  // int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+  //              static_cast<int>(param->Paddings()[1]);
+  // param->SetOffset(offset);
+  //
+  // if (param->Filter()->WidthOfOneBlock() == 1 &&
+  //     param->Filter()->HeightOfOneBlock() == 1) {
+  //   this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  // } else if (param->Filter()->dims()[1] == 1) {
+  //   this->cl_helper_.AddKernel("depth_conv_3x3",
+  //                              "conv_add_bn_relu_kernel.cl");
+  // } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+  //            param->Filter()->HeightOfOneBlock() == 3) {
+  //   this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  // } else {
+  //   PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  // }
   return true;
 }

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto output = param.Output();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = param.Input()->CBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->WidthOfOneBlock();
-  int input_height = param.Input()->HeightOfOneBlock();
-
-  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  clSetKernelArg(kernel, 1, sizeof(int), &w);
-  clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-
-  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-                         default_work_size.data(), NULL, 0, NULL, NULL);
+  // auto kernel = this->cl_helper_.KernelAt(0);
+  // auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  // int c_block = default_work_size[0];
+  // int w = default_work_size[1];
+  // int nh = default_work_size[2];
+  // auto input = param.Input()->GetCLImage();
+  // auto filter = param.Filter()->GetCLImage();
+  // auto output = param.Output();
+  // int stride = param.Strides()[0];
+  // int offset = param.Offset();
+  // int input_c = param.Input()->CBlock();
+  // int dilation = param.Dilations()[0];
+  // int input_width = param.Input()->WidthOfOneBlock();
+  // int input_height = param.Input()->HeightOfOneBlock();
+  //
+  // clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  // clSetKernelArg(kernel, 1, sizeof(int), &w);
+  // clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  // clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  // clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  // clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  // clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  // clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  // clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  // clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  // clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  // clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  //
+  // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+  //                        default_work_size.data(), NULL, 0, NULL, NULL);

   // auto kernel = this->cl_helper_.KernelAt(0);
   // size_t global_work_size[3] = {1, 2, 3};
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/feed_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
DLOG<<"Init feed";
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
DLOG<<"feed_kernel";
auto kernel = this->cl_helper_.KernelAt(0);
cl_int status;
auto output = param.Out();
auto input = param.InputX();
const float *input_data = input->data<float>();
cl_mem cl_image = output->GetCLImage();
int height = output->dims()[2];
int width = output->dims()[3];
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width);
status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height);
size_t global_work_size[2] = {height,width};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
}
template class FeedKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
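Because feed_kernel.cl declares `in` as `__global float*`, the raw host pointer in Compute above cannot be handed to clSetKernelArg directly; it has to be staged into a device buffer first. A minimal sketch, not part of this commit, assuming a cl_context `context` is available (e.g. from the scope that owns the command queue) and `numel` is the number of input elements:

  // Wrap the host data in a cl_mem buffer and pass that as argument 0.
  cl_int err = CL_SUCCESS;
  cl_mem input_buf =
      clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                     sizeof(float) * numel,
                     const_cast<float *>(input_data), &err);
  if (err == CL_SUCCESS) {
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buf);
  }
  // ... enqueue the kernel, then drop the host reference; the runtime keeps
  // the buffer alive until enqueued commands that use it have completed.
  clReleaseMemObject(input_buf);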
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

using namespace framework;

template <typename DeviceType, typename T>
class FeedKernel
    : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
 public:
  void Compute(const FeedParam<DeviceType> &param);
  bool Init(FeedParam<DeviceType> *param);
};

}  // namespace operators
}  // namespace paddle_mobile
...
@@ -911,13 +911,13 @@ class FeedParam : public OpParam {
  public:
   FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope *scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, *scope);
-    out_ = OutFrom<GType>(outputs, *scope);
-    auto var = scope->Var("batch_size");
+            const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    auto var = scope.FindVar("batch_size");
     batch_size = var->GetValue<int>();
   }
-  const GType *InputX() const { return input_x_; }
+  const LoDTensor *InputX() const { return input_x_; }
   GType *Out() const { return out_; }
   const int BatchSize() const { return batch_size; }
......