diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp
index 8b0316af4f90803871f09aa3bda737c466390bf9..a999971192ceb01299b3b03846a95ec257de61d3 100644
--- a/src/framework/cl/cl_image.cpp
+++ b/src/framework/cl/cl_image.cpp
@@ -168,9 +168,8 @@ Print &operator<<(Print &printer, const CLImage &cl_image) {
       i0 += width * H;
     }
 
-  if (err != CL_SUCCESS) {
-    CL_CHECK_ERRORS(err);
-  }
+  CL_CHECK_ERRORS(err);
+
   for (int i = 0; i < cl_image.numel(); i += stride) {
     printer << data[i] << " ";
   }
diff --git a/src/framework/cl/cl_tensor.h b/src/framework/cl/cl_tensor.h
index c38091dd39c776254035f9b13c8505d64686915a..1d6829fe4b77639f34df0be37d7a539b91ff4bcc 100644
--- a/src/framework/cl/cl_tensor.h
+++ b/src/framework/cl/cl_tensor.h
@@ -28,7 +28,19 @@ namespace framework {
 
 class CLTensor : TensorBase {
  public:
-  explicit CLTensor(cl_context context) : context_(context) {}
+  CLTensor(cl_context context, cl_command_queue command_queue)
+      : context_(context), command_queue_(command_queue) {}
+
+  CLTensor() = default;
+
+  /*
+   * If init has not set context and command_queue, they need to be set here.
+   * */
+  void SetContextAndCommandQueue(cl_context context,
+                                 cl_command_queue command_queue) {
+    context_ = context;
+    command_queue_ = command_queue;
+  }
 
   /*! Resize the dimensions of the memory block. */
   inline CLTensor &Resize(const DDim &dims) {
@@ -39,7 +51,8 @@ class CLTensor : TensorBase {
   template <typename T>
   inline T mutable_with_data(void *data) {
     int64_t size = numel() * sizeof(float);
-    holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_));
+    holder_.reset(
+        new PlaceholderImpl(size, data, typeid(T), context_, command_queue_));
     return reinterpret_cast<T>(
         reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr())));
   }
@@ -51,7 +64,7 @@ class CLTensor : TensorBase {
     PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
     int64_t size = numel() * SizeOfType(type);
     if (holder_ == nullptr || holder_->size() < size + offset_) {
-      holder_.reset(new PlaceholderImpl(size, type, context_));
+      holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
       offset_ = 0;
     }
     return reinterpret_cast<void *>(
@@ -85,6 +98,7 @@ class CLTensor : TensorBase {
 
  private:
   cl_context context_;
+  cl_command_queue command_queue_;
 
   /*
    *   virtual ~Placeholder() = default;
@@ -99,20 +113,31 @@
    * */
   struct PlaceholderImpl : public Placeholder {
     PlaceholderImpl(size_t size, void *input, std::type_index type,
-                    cl_context context)
+                    cl_context context, cl_command_queue command_queue)
         : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               size, reinterpret_cast<void *>(input), NULL)),
           size_(size),
-          type_(type) {}
+          type_(type),
+          command_queue_(command_queue) {}
 
-    PlaceholderImpl(size_t size, std::type_index type, cl_context context)
+    PlaceholderImpl(size_t size, std::type_index type, cl_context context,
+                    cl_command_queue command_queue)
         : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
           size_(size),
-          type_(type) {}
+          type_(type),
+          command_queue_(command_queue) {}
 
     virtual size_t size() const { return size_; }
 
-    virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
+    virtual void *ptr() const {
+      if (host_ptr_) {
+        delete (host_ptr_);
+      }
+      char *host_ptr = new char[size_];
+      clEnqueueReadBuffer(command_queue_, ptr_.get(), CL_TRUE, 0, size_,
+                          host_ptr, 0, NULL, NULL);
+      return static_cast<void *>(host_ptr);
+    }
 
     virtual std::type_index type() const { return type_; }
 
@@ -124,6 +149,17 @@
 
     /* the current type of memory */
     std::type_index type_;
+
+    cl_command_queue command_queue_;
+
+    ~PlaceholderImpl() {
+      if (host_ptr_) {
+        delete (host_ptr_);
+      }
+    }
+
+   private:
+    void *host_ptr_;
   };
 };
 
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 7980a2d1f6e3f46060dd25e5a6bede7c50cf7c8d..80589706f94eb0c2331d5af0049c6d53df8ca876 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif
 
-int debug_to = 3;
+int debug_to = 115;
 
 namespace paddle_mobile {
 namespace framework {
@@ -87,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   for (int i = 0; i < blocks.size(); ++i) {
     std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
     std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < debug_to; ++j) {
+    for (int j = 0; j < ops.size(); ++j) {
       std::shared_ptr<framework::OpDesc> op = ops[j];
       DLOG << "create op: " << j << " " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
@@ -416,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
     }
   }
 #else
-  for (int i = 0; i < debug_to; i++) {
+  for (int i = 0; i < ops.size(); i++) {
 #ifdef PADDLE_MOBILE_PROFILE
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index 7ce60e6d1e9a687a3f6623ff8dd8e07576c02daf..272e130817eda62f71a67e179a57ce63f024bc4d 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -40,6 +40,11 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   const framework::CLImage *scale = param->InputScale();
   const framework::CLImage *bias = param->InputBias();
   const float epsilon = param->Epsilon();
+  //
+  //  DLOG << " climage mean: " << *mean;
+  //  DLOG << " climage variance: " << *variance;
+  //  DLOG << " climage scale: " << *scale;
+  //  DLOG << " climage bias: " << *bias;
 
   auto mean_ptr = mean->data();
   auto variance_ptr = variance->data();
@@ -67,12 +72,20 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 
   new_scale->InitCLImage(this->cl_helper_.CLContext(),
                          cl_helper_.CLCommandQueue());
 
+  DLOG << " climage - y bias: " << *(param->Bias());
+
+  DLOG << " climage - new scale: " << *new_scale;
+
   framework::CLImage *new_bias = new framework::CLImage();
   new_bias->SetTensorData(new_bias_ptr, variance->dims());
   new_bias->InitCLImage(this->cl_helper_.CLContext(),
                         cl_helper_.CLCommandQueue());
 
+  DLOG << " climage - new bias: " << *new_bias;
+
+  DLOG << " climage - filter: " << *(param->Filter());
+
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp
index f0587d69dfddc31f5fe0c5c215aea53bf75c42ed..0db2b7cc4665ff74d06ca62ba9e77d427d883233 100644
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -36,7 +36,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   cl_mem cl_image = output->GetCLImage();
   int height = output->dims()[2];
   int width = output->dims()[3];
-  CLTensor input_cl_tensor(this->cl_helper_.CLContext());
+  CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
+                           this->cl_helper_.CLCommandQueue());
   input_cl_tensor.Resize(input->dims());
   cl_mem inputBuffer =
       input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data);
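
Note (appended after the patch, not part of it): in the new PlaceholderImpl::ptr() above, the freshly allocated host_ptr is never stored back into host_ptr_, and host_ptr_ is never initialized, so the delete in ptr() and in ~PlaceholderImpl() acts on an indeterminate pointer and every call leaks its staging buffer. Below is a minimal sketch of the host-staging pattern the patch appears to be reaching for, written against the plain OpenCL C API; the struct and member names (HostStagedBuffer, host_copy, etc.) are illustrative and not taken from the repository.

// Sketch only: a device buffer paired with a lazily (re)allocated host copy,
// assuming the cl_mem and command queue are owned and released elsewhere.
#include <CL/cl.h>
#include <cstddef>

struct HostStagedBuffer {
  cl_command_queue queue = nullptr;  // queue used for the blocking read
  cl_mem device_buf = nullptr;       // device-side buffer to stage from
  size_t size = 0;                   // size of the buffer in bytes
  char *host_copy = nullptr;         // staging copy handed out by ptr()

  // Blocking read of the device buffer into a reusable host allocation.
  void *ptr() {
    delete[] host_copy;  // delete[] on nullptr is a no-op
    host_copy = new char[size];
    clEnqueueReadBuffer(queue, device_buf, CL_TRUE /* blocking */, 0, size,
                        host_copy, 0, nullptr, nullptr);
    return host_copy;
  }

  ~HostStagedBuffer() { delete[] host_copy; }
};

Keeping the staging pointer initialized and re-pointed at every allocation is what makes both repeated reads and the destructor safe; the same applies to host_ptr_ in the patched PlaceholderImpl.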