diff --git a/src/framework/cl/cl_tensor.h b/src/framework/cl/cl_tensor.h
index a0ed438f9773dbacaaa7446d594719c0cf12b32e..01fdc7970e772e945ae880cee5bdc2bec589ffab 100644
--- a/src/framework/cl/cl_tensor.h
+++ b/src/framework/cl/cl_tensor.h
@@ -26,7 +26,7 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {
 
-class CLTensor : TensorBase {
+class CLTensor : public TensorBase {
  public:
   CLTensor(cl_context context, cl_command_queue command_queue)
       : context_(context), command_queue_(command_queue) {}
diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp
index c8c94038aa3536431c1fc9dcf982e6714b6484bb..0522905fee91fd466b2c334677acce0d25cfac7e 100644
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -31,8 +31,6 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   auto kernel = this->cl_helper_.KernelAt(0);
   auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
   cl_int status;
-  param.Out()->InitEmptyImage(cl_helper_.CLContext(),
-                              cl_helper_.CLCommandQueue(), param.Out()->dims());
   auto output = param.Out();
   const Tensor *input = &param.InputX()->at(col);
   //  DLOG << *input;
diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp
index 4a477f081e89b6fe7b1dbd34ab80cacfea2c21fd..2ce3e928a12ce752236709211ce06a52e3fcd9c3 100644
--- a/src/operators/kernel/cl/fetch_kernel.cpp
+++ b/src/operators/kernel/cl/fetch_kernel.cpp
@@ -14,19 +14,13 @@ limitations under the License. */
 
 #include "operators/kernel/fetch_kernel.h"
 #include "framework/cl/cl_tensor.h"
-// #include "common/common.h"
-// #include <iostream>
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
 bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
-  //  if (param->InputX()->dims().size() <= 2) {
-  //    this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
-  //  } else {
   this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
-  //  }
   return true;
 }
@@ -40,25 +34,28 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
   auto *out = &param.Out()->at(col);
   out->Resize(param.InputX()->dims());
   out->mutable_data<float>();
-  const auto &dim = param.InputX()->dims();
+
+  DLOG << "fetch kernel out dims = " << out->dims();
+  DLOG << "fetch kernel out memory size = " << out->memory_size();
+
+  auto dim = param.InputX()->dims();
   size_t new_dims[] = {1, 1, 1, 1};
 
   for (int j = 0; j < dim.size(); ++j) {
     new_dims[4 - dim.size() + j] = dim[j];
   }
 
-  size_t C, in_height, in_width;
+  size_t in_ch, in_height, in_width;
 
-  C = new_dims[1];
+  in_ch = new_dims[1];
   in_height = new_dims[2];
-  //  if (dim.size() <= 2) {
-  //    in_width = param.InputX()->ImageWidth();
-  //  } else {
   in_width = new_dims[3];
-  //  }
+
+  int size_ch = in_height * in_width;
+  int size_block = size_ch * 4;
+  int size_batch = size_ch * in_ch;
 
-  CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
-                         this->cl_helper_.CLCommandQueue());
+  framework::CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
+                                    this->cl_helper_.CLCommandQueue());
   out_cl_tensor.Resize(out->dims());
   cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
@@ -66,35 +63,28 @@
   clSetKernelArg(kernel, 1, sizeof(int), &in_width);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
   clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
-  //  if (dim.size() > 2) {
-  int size_ch = in_height * in_width;
-  int size_block = size_ch * 4;
-  int size_batch = size_ch * C;
-  int out_c = new_dims[1];
   clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
   clSetKernelArg(kernel, 5, sizeof(int), &size_block);
   clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
-  clSetKernelArg(kernel, 7, sizeof(int), &out_c);
-  //  }
+  clSetKernelArg(kernel, 7, sizeof(int), &in_ch);
 
   //  cl_event wait_event = param.InpdutX()->GetClEvent();
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                          default_work_size.data(), NULL, 0, NULL, NULL);
-  //  auto time1 = paddle_mobile::time();
-  //  printf(" before finish \n");
   //  clFlsh(this->cl_helper_.CLCommandQueue());
   clFinish(this->cl_helper_.CLCommandQueue());
   //  printf(" after finish \n");
-  //  auto time2 = paddle_mobile::time();
-  //
-  //
-  //  std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
-  //            << "ms" << std::endl;
+  DLOG << "fetch kernel out dims = " << out->dims();
+  DLOG << "fetch kernel out memory size = " << out->memory_size();
 
-  memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
+  DLOG << "fetch kernel out_cl_tensor dims = " << out_cl_tensor.dims();
+  DLOG << "fetch kernel out_cl_tensor memory size = "
+       << out_cl_tensor.memory_size();
+
+  memcpy(out->data<float>(), out_cl_tensor.Data<float>(),
+         sizeof(float) * out->numel());
 }
 
 template class FetchKernel<GPU_CL, float>;
diff --git a/test/net/test_super.cpp b/test/net/test_super.cpp
index dcae08887de02cda30d291801c2696206f0bf84d..6815a886a7e22178c52bc447a99faa910520d817 100644
--- a/test/net/test_super.cpp
+++ b/test/net/test_super.cpp
@@ -21,12 +21,14 @@ int main() {
   paddle_mobile::PaddleMobileConfigInternal config;
   config.load_when_predict = true;
-  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
-  //  paddle_mobile.SetThreadNum(4);
   auto time1 = paddle_mobile::time();
 #ifdef PADDLE_MOBILE_CL
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
   paddle_mobile.SetCLPath("/data/local/tmp/bin");
+#else
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
 #endif
+  //  paddle_mobile.SetThreadNum(4);
 
   auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
                                  std::string(g_super) + "/params", true, false,
@@ -131,12 +133,12 @@ int main() {
       auto time5 = paddle_mobile::time();
       vec_result4 = paddle_mobile.Predict(input4, dims4);
       auto time6 = paddle_mobile::time();
-      std::cout << "224*224 predict cost, iteration " << i << ": "
+      std::cout << "300*300 predict cost, iteration " << i << ": "
                 << paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
     }
     auto time4 = paddle_mobile::time();
-    std::cout << "224*224 predict cost :"
+    std::cout << "300*300 predict cost :"
               << paddle_mobile::time_diff(time3, time4) / max << "ms"
               << std::endl;
 
     //  biggest = std::max_element(std::begin(vec_result4),
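Note for reviewers: the fetch_kernel.cpp hunks above right-align an arbitrary-rank dim vector into {N, C, H, W} and derive the three scalar kernel arguments from it. A minimal host-side sketch of that math follows; it is not part of the patch, assumes dim.size() <= 4 as the kernel code does, and the names FetchArgs and ComputeFetchArgs are hypothetical, used only for illustration.

// Standalone C++ sketch mirroring the `new_dims` padding and the
// size_ch / size_block / size_batch derivation in FetchKernel::Compute.
#include <cstddef>
#include <vector>

struct FetchArgs {
  size_t in_ch;      // C after padding dims to NCHW
  size_t in_height;  // H
  size_t in_width;   // W
  int size_ch;       // elements in one channel plane (H * W)
  int size_block;    // elements covered by one 4-channel image block
  int size_batch;    // elements in one batch item (C * H * W)
};

FetchArgs ComputeFetchArgs(const std::vector<size_t> &dim) {
  // Right-align the real dims into {N, C, H, W}, padding with 1s on the
  // left, exactly like the `new_dims` loop in the patch. A 2-D {n, c}
  // input therefore degenerates to H = W = 1.
  size_t new_dims[] = {1, 1, 1, 1};
  for (size_t j = 0; j < dim.size(); ++j) {
    new_dims[4 - dim.size() + j] = dim[j];
  }
  FetchArgs args;
  args.in_ch = new_dims[1];
  args.in_height = new_dims[2];
  args.in_width = new_dims[3];
  args.size_ch = static_cast<int>(args.in_height * args.in_width);
  args.size_block = args.size_ch * 4;  // channels are packed 4 at a time
  args.size_batch = args.size_ch * static_cast<int>(args.in_ch);
  return args;
}

For example, a {1, 3, 224, 224} input yields size_ch = 50176, size_block = 200704, size_batch = 150528. The memcpy change in the same hunk (sizeof(float) * out->numel() instead of out->memory_size()) presumably pins the copy length to exactly the tensor payload, so a larger-than-needed underlying allocation can no longer inflate the read from the CL buffer.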