Enable CLTensor::mutable_with_data, convert the OpenCL feed kernel to fp16, and fix feed kernel argument indices

7fb2e727 · yangfei · af0be6e0 · 7fb2e727 · 7fb2e727 · 7fb2e727
3 changed files
--- a/src/framework/cl/cl_tensor.h
+++ b/src/framework/cl/cl_tensor.h
@@ -36,13 +36,13 @@ class CLTensor : TensorBase {
    return *this;
  }

-  //  template<typename T>
-  //  inline T *mutable_with_data(void *data) {
-  //    int64_t size = numel() * sizeof(float);
-  //    holder_.reset(new PlaceholderImpl(size, data, typeid(T)));
-  //    return reinterpret_cast<T *>(reinterpret_cast<void *>(
-  //            reinterpret_cast<uintptr_t>(holder_->ptr())));
-  //  }
+  template <typename T>
+  inline T *mutable_with_data(void *data) {
+    int64_t size = numel() * sizeof(float);
+    holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_));
+    return reinterpret_cast<T *>(
+        reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr())));
+  }

  inline void *mutable_data(std::type_index type) {
    if (holder_ != nullptr) {
@@ -51,7 +51,7 @@ class CLTensor : TensorBase {
    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
    int64_t size = numel() * SizeOfType(type);
    if (holder_ == nullptr || holder_->size() < size + offset_) {
-      holder_.reset(new PlaceholderImpl(size, type));
+      holder_.reset(new PlaceholderImpl(size, type, context_));
      offset_ = 0;
    }
    return reinterpret_cast<void *>(

--- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
-__kernel void feed(__global float* in, __write_only image2d_t outputImage,int h,int w)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
 {
-     int j = get_global_id(0);
-     int i = get_global_id(1);
-     float4 pixel;
-     pixel.x = in[(i * w + j)];
-     pixel.y = in[h * w + (i * w + j)];
-     pixel.z = in[2 * h * w + (i * w + j)];
-     pixel.w = 0;
-     int2 coords;
-     coords.x = j;
-     coords.y = i;
+        int j = get_global_id(0);
+        int i = get_global_id(1);
+        half4 pixel;
+        pixel.x = convert_half(in[(i * w + j)]);
+        pixel.y = convert_half(in[h * w + (i * w + j)]);
+        pixel.z = convert_half(in[2 * h * w + (i * w + j)]);
+        pixel.w = 0.0;
+        int2 coords;
+        coords.x = j;
+        coords.y = i;

-     write_imagef(outputImage,coords,pixel);
+        write_imageh(outputImage,coords,pixel);
 }
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "operators/kernel/feed_kernel.h"
-#include "common/log.h"
-
+#include "framework/cl/cl_tensor.h"
 namespace paddle_mobile {
 namespace operators {

@@ -29,39 +28,46 @@ template <>
 void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  cl_int status;
-  DLOG << " feed 0";
  auto output = param.Out();
-  DLOG << " feed 1";
  const Tensor *input = param.InputX();
-  DLOG << " feed 2";
-  const float *input_data = nullptr;
-  DLOG << " feed 3";
-  input_data = input->data<float>();
-  DLOG << " feed 4";
-
+  const float *input_data = input->data<float>();
+  int numel = input->numel();
+  DLOG << "numel = " << numel;
  cl_mem cl_image = output->GetCLImage();
-  DLOG << " feed 5";
-
  int height = output->dims()[2];
  int width = output->dims()[3];
-
  DLOG << output->dims();
-  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data);
-  CL_CHECK_ERRORS(status);
+  CLTensor input_cl_tensor(this->cl_helper_.CLContext());
+  input_cl_tensor.Resize(input->dims());
+  cl_mem *inputBuffer =
+      input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data);
+  DLOG << "yangfei";

-  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image);
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&cl_image);
  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&height);
  CL_CHECK_ERRORS(status);

  size_t global_work_size[2] = {height, width};
+  DLOG << "yangfei";
  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                  NULL, global_work_size, NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
+
+  int len = 4 * 224 * 224;
+  half *out = new half[len];
+  DLOG << "yangfei";
+  cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue();
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {height, width, 1};
+  clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out,
+                     0, NULL, NULL);
+  DLOG << "yangfei";
+  for (int i = 0; i < 100; i++) DLOG << out[i];
 }

 template class FeedKernel<GPU_CL, float>;