imp feed op

f3fe2493 · yangfei · 7fb2e727 · f3fe2493 · f3fe2493
隐藏空白更改
内联并排

Showing 2 changed files with 4 additions and 10 deletions

src/framework/cl/cl_tensor.h +2 -2

src/operators/kernel/cl/feed_kernel.cpp +2 -8

未找到文件。
--- a/src/framework/cl/cl_tensor.h
+++ b/src/framework/cl/cl_tensor.h
@@ -37,10 +37,10 @@ class CLTensor : TensorBase {
  }

  template <typename T>
-  inline T *mutable_with_data(void *data) {
+  inline T mutable_with_data(void *data) {
    int64_t size = numel() * sizeof(float);
    holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_));
-    return reinterpret_cast<T *>(
+    return reinterpret_cast<T>(
        reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr())));
  }


--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -32,16 +32,13 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  const Tensor *input = param.InputX();
  const float *input_data = input->data<float>();
  int numel = input->numel();
-  DLOG << "numel = " << numel;
  cl_mem cl_image = output->GetCLImage();
  int height = output->dims()[2];
  int width = output->dims()[3];
-  DLOG << output->dims();
  CLTensor input_cl_tensor(this->cl_helper_.CLContext());
  input_cl_tensor.Resize(input->dims());
-  cl_mem *inputBuffer =
+  cl_mem inputBuffer =
      input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data);
-  DLOG << "yangfei";

  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
  CL_CHECK_ERRORS(status);
@@ -53,21 +50,18 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  CL_CHECK_ERRORS(status);

  size_t global_work_size[2] = {height, width};
-  DLOG << "yangfei";
  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                  NULL, global_work_size, NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);

  int len = 4 * 224 * 224;
  half *out = new half[len];
-  DLOG << "yangfei";
  cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {height, width, 1};
  clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out,
                     0, NULL, NULL);
-  DLOG << "yangfei";
-  for (int i = 0; i < 100; i++) DLOG << out[i];
+  for (int i = 0; i < numel; i++) DLOG << Half2Float(out[i]);
 }

 template class FeedKernel<GPU_CL, float>;