diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h
index 75ac9a81ee20afcb4272c8abacd163ccb1e3fee0..0f8046f8f151d53480cf8054763c4a4ec9209ec1 100644
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "CL/cl.h"
 #include "common/enforce.h"
 #include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_tool.h"
 
 namespace paddle_mobile {
 namespace framework {
diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h
index baedef23c194e370c9e5f303789a8f6358f644c2..58c8ea6bc4e76dddf15a263878e5270563d2ed57 100644
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -43,17 +43,28 @@ class CLImage {
       C = tensor_dims_[1];
       H = tensor_dims_[2];
       W = tensor_dims_[3];
+
+      width_of_one_block_ = W;
+      height_of_one_block_ = H;
+
     } else if (tensor_dims_.size() == 1) {
       N = 1;
       C = tensor_dims_[0];
       H = 1;
       W = 1;
+
+      width_of_one_block_ = W;
+      height_of_one_block_ = H;
     }
 
     DLOG << "-------InitMemory-------";
 
     size_t width = W * ((C + 3) / 4);
     size_t height = H * N;
+
+    image_width_ = width;
+    image_height_ = height;
+
     std::unique_ptr<half_t[]> imageData{};
     int count = 0;
     if (tensorInput != nullptr) {
@@ -95,9 +106,13 @@ class CLImage {
         0,       // size_t image_row_pitch
         reinterpret_cast<void *>(imageData.get()),  // void *host_ptr
         &err);
+
     if (err != CL_SUCCESS) {
       // TODO(HaiPeng): error handling
+      PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
     }
+
+    initialized_ = true;
   }
 
   void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); }
@@ -109,8 +124,6 @@ class CLImage {
 
   const DDim &dims() const { return tensor_dims_; }
 
-  std::vector<size_t> DefaultWorkSize() { return {}; }
-
   cl_mem GetCLImage() const { return cl_image_; }
 
   template <typename T>
@@ -120,24 +133,24 @@ class CLImage {
 
   inline int64_t numel() const { return product(tensor_dims_); }
 
-  int ImageWidth() const { return image_width_; }
+  inline size_t ImageWidth() const { return image_width_; }
 
-  int ImageHeight() const { return image_height_; }
+  inline size_t ImageHeight() const { return image_height_; }
 
-  int CBlock() const { return c_block_; }
+  inline size_t CBlock() const { return c_block_; }
 
-  int WidthOfOneBlock() const { return width_of_one_block_; }
+  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
 
-  int HeightOfOneBlock() const { return height_of_one_block_; }
+  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
 
  private:
   bool initialized_ = false;
   cl_mem cl_image_;
-  int image_width_;
-  int width_of_one_block_;
-  int height_of_one_block_;
-  int image_height_;
-  int c_block_;
+  size_t image_width_;
+  size_t width_of_one_block_;
+  size_t height_of_one_block_;
+  size_t image_height_;
+  size_t c_block_;
   DDim tensor_dims_;
   float *tensor_input_;
   cl_context context_;
diff --git a/src/framework/loader.cpp b/src/framework/loader.cpp
index 311f4f1db1b9ed1278190122c20e03f8ef8df9b7..0122f8916f2aa454df7ffe2ebaa8bbc9fe686b7b 100644
--- a/src/framework/loader.cpp
+++ b/src/framework/loader.cpp
@@ -95,15 +95,15 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
  */
 template <typename Dtype, Precision P>
 void FusionAndPrintInfos(
-    bool optimize, bool can_add_split, const Program<Dtype, P> &program,
+    bool optimize, bool can_add_split, Program<Dtype, P> *program,
     const std::shared_ptr<ProgramDesc> &originProgramDesc) {
   if (optimize) {
     ProgramOptimize program_optimize;
-    program.optimizeProgram =
+    program->optimizeProgram =
         program_optimize.FusionOptimize(originProgramDesc, can_add_split);
   }
   if (optimize) {
-    program.optimizeProgram->Description("optimize: ");
+    program->optimizeProgram->Description("optimize: ");
   } else {
     originProgramDesc->Description("program: ");
   }
@@ -186,7 +186,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
   // use  originProgramDesc and scope to init tensors
   InitMemoryFromProgram(originProgramDesc, scope);
   // perform fusion and print infos
-  FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
+  FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
 
   paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
   return program;
@@ -195,7 +195,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
 template <typename Dtype, Precision P>
 const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
-    const uint8_t *combined_params_buf, bool optimize, bool quantification) {
+    uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
 
   PaddleMobile__Framework__Proto__ProgramDesc *c_program;
@@ -221,7 +221,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
   auto scope = std::make_shared<Scope>();
   program.scope = scope;
   InitMemoryFromProgram(originProgramDesc, scope);
-  FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
+  FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
   paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
                                                                nullptr);
   return program;
diff --git a/src/framework/loader.h b/src/framework/loader.h
index ce2ffdb4c27ca8f305c4dfd9b2c0ea987dadb33f..3200f0b25368fa123b80c51000cfd6c6a6d084b6 100644
--- a/src/framework/loader.h
+++ b/src/framework/loader.h
@@ -46,7 +46,7 @@ class Loader {
   const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
                                              const uint8_t *model_buf,
                                              size_t combined_params_len,
-                                             const uint8_t *combined_params_buf,
+                                             uint8_t *combined_params_buf,
                                              bool optimize = false,
                                              bool quantification = false);
 
diff --git a/src/framework/program/program.h b/src/framework/program/program.h
index 192328a567e6d3bfad7a8a3b35e3bc64131a2cd2..ae3e7b0abea2b2e1fc41962dc1d926fed252e096 100644
--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -32,7 +32,7 @@ class Program {
   bool combined = false;
   bool quantification = false;
   size_t combined_params_len;
-  const uint8_t *combined_params_buf;
+  uint8_t *combined_params_buf;
 
  private:
 };
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index 54a2a4e912266f12c2dd4c232cb3061a7a487bb1..8bea4412ac0b371e029f5aa8914bcb2d6eeb547b 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -68,9 +68,10 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
 }
 
 template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::LoadCombinedMemory(
-    size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
-    const uint8_t *combined_params_buf) {
+bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
+                                                const uint8_t *model_buf,
+                                                size_t combined_params_len,
+                                                uint8_t *combined_params_buf) {
   int batch_size = 1;
   bool optimise = true;
   bool quantification = false;
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index 2259fee1a53393407f2b8e0c06d0f94cd39da0fe..b53dc5ac9eb1255a51992c3dd4dbbba3f306c467 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -83,7 +83,7 @@ class PaddleMobile {
    */
   bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                           size_t combined_params_len,
-                          const uint8_t *combined_params_buf);
+                          uint8_t *combined_params_buf);
 
   void Clear();
 
diff --git a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
index d441932dbf0dc5f079c0bdf13a30c8e5b3215cf7..f731a61a82f9d1e7d44e760037512157c4ffef19 100644
--- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 __kernel void elementwise_add(__global float* in, __global float* out) {
      int num = get_global_id(0);
      out[num] = in[num] * 0.1  + 102;
diff --git a/src/operators/kernel/cl/conv_add_bn_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_kernel.cpp
deleted file mode 100644
index f5141015dbc9b5a4d61a71baf42469c95cb0f609..0000000000000000000000000000000000000000
--- a/src/operators/kernel/cl/conv_add_bn_kernel.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDBN_OP
-
-#include "operators/kernel/conv_add_bn_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool ConvAddBNReluKernel<GPU_CL, float>::Init(
-    FusionConvAddBNReluParam<GPU_CL> *param) {
-  return true;
-}
-
-template <>
-void ConvAddBNReluKernel<GPU_CL, float>::Compute(
-    const FusionConvAddBNReluParam<GPU_CL> &param) {}
-
-template class ConvAddBNReluKernel<GPU_CL, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index 9fb7b94927e34d11d0a6f54ee3a755bd88e0b292..e62041d3f47aae8dbc9078d49beb84d45c2d9423 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -37,6 +37,7 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   auto bias_ptr = bias->data<float>();
 
   const int C = mean->numel();
+
   float inv_std_ptr[C];
   for (int i = 0; i < C; i++) {
     inv_std_ptr[i] =
@@ -55,8 +56,13 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 
   framework::CLImage *new_scale = new framework::CLImage();
 
+  new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr,
+                  variance->dims());
+
   framework::CLImage *new_bias = new framework::CLImage();
 
+  new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims());
+
   param->SetNewScale(new_scale);
 
   param->SetNewBias(new_bias);
@@ -65,10 +71,23 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
       param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
           param->Paddings()[0] == param->Paddings()[1],
       "need equal");
-  param->SetOffset(param->Filter()->dims()[2] / 2 -
-                   static_cast<int>(param->Paddings()[1]));
 
-  this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
 
   return true;
 }
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
index 0eb4fd61afe78654c1988ab0a0c5144e22531b5f..74de92e4c28709a5fdffa99402b1214982475511 100644
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -21,12 +21,63 @@ namespace operators {
 
 template <>
 bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
   return true;
 }
 
 template <>
 void ConvAddKernel<GPU_CL, float>::Compute(
-    const FusionConvAddParam<GPU_CL> &param) {}
+    const FusionConvAddParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto biase = param.Bias()->GetCLImage();
+  auto output = param.Output();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = param.Input()->CBlock();
+  int input_width = param.Input()->WidthOfOneBlock();
+  int input_height = param.Input()->HeightOfOneBlock();
+
+  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  clSetKernelArg(kernel, 1, sizeof(int), &w);
+  clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+  clSetKernelArg(kernel, 9, sizeof(int), &stride);
+  clSetKernelArg(kernel, 10, sizeof(int), &offset);
+  clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+  clSetKernelArg(kernel, 12, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 13, sizeof(int), &input_height);
+
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+}
 
 template class ConvAddKernel<GPU_CL, float>;
 
diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp
index 1e56b29ef9e9aa5bcc1f40ca3c045f92b6d0c3be..ec265b7992cd62fd4f77399698c377570c2b7a61 100644
--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -21,12 +21,62 @@ namespace operators {
 
 template <>
 bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
-  //  this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
   return true;
 }
 
 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = param.Input()->CBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->WidthOfOneBlock();
+  int input_height = param.Input()->HeightOfOneBlock();
+
+  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  clSetKernelArg(kernel, 1, sizeof(int), &w);
+  clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+
   //  auto kernel = this->cl_helper_.KernelAt(0);
   //  size_t global_work_size[3] = {1, 2, 3};
   //  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,