diff --git a/CMakeLists.txt b/CMakeLists.txt
index b40aee8088bb6889858aa76776e6b060030de7c2..553caa9c47b73cd86b1942983d6c201f2046964f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ if (DEBUGING)
     message(STATUS "debug")
     set(CMAKE_BUILD_TYPE Release)
     set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
+#    add_definitions(-DPADDLE_MOBILE_DEBUG)
 else ()
     set(CMAKE_BUILD_TYPE Release)
     set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
index bb89ceb1397c4644f57cd649ccb7a532b643af04..9d0857a45e0766482e2dbb6ded77edb07517bc0f 100644
--- a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
@@ -1,7 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-__kernel void batchnorm(__private const int out_height,
-                        __private const int out_width,
+__kernel void batchnorm(__private const int out_width,
                         __read_only image2d_t input,
                         __read_only image2d_t new_scale_image,
                         __read_only image2d_t new_bias_image,
diff --git a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
index 54835ab89755fe9777b2e3019224029d94421bed..1f2e36687ab04be2b8c18b26e868b7709bc3c231 100644
--- a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) {
      int x = get_global_id(0);
diff --git a/src/operators/kernel/cl/cl_kernel/cl_common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h
index d718ea48aee5c38498f3fd1b8b3a7ea4b1b8b6dc..34f36eb9a3ffbdc5781c974926ea4a7d5258636b 100644
--- a/src/operators/kernel/cl/cl_kernel/cl_common.h
+++ b/src/operators/kernel/cl/cl_kernel/cl_common.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 inline half4 activation(half4 in
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
index 7928fc33c04f0b5befd3e672a5f08e38983f480b..aa3eaedda5634294f231831d550296dfdba0dd48 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #define BIASE
 #define BATCH_NORM
 #define RELU
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
index c5ca2f659b7545fcf096ab205fe54ceded7d33ce..b8bf7e7d7d9fbb9eb9e930e9c1c3a58bb3391efc 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #define BIASE
 
 #include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index b72e87288eb0dadbb23cb1b62de6d529eb9b6a8f..db3c8d3ca74dd25a827fcb594728ce81bfc1078a 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -56,7 +56,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     if (out_c >= global_size_dim0 ||
         out_w >= global_size_dim1 ||
         out_nh >= global_size_dim2) {
-        printf(" out of range ");
         return;
     }
 
@@ -134,22 +133,22 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                           (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
 
         for (int j = 0; j < 9; ++j) {
-            int2 fuck;
-            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, fuck);
+            int2 pos_of_weight;
+            pos_of_weight.x = i * 3 + j % 3;
+            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
             output.x += dot(input[j], weight_x);
 
-            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
             output.y += dot(input[j], weight_y);
 
-            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
             output.z += dot(input[j], weight_z);
 
-            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
             output.w += dot(input[j], weight_w);
         }
     }
@@ -321,6 +320,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
 
 }
 
+
 __kernel void conv_1x1(__private const int global_size_dim0,
                        __private const int global_size_dim1,
                        __private const int global_size_dim2,
@@ -349,92 +349,179 @@ __kernel void conv_1x1(__private const int global_size_dim0,
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP         |
                            CLK_FILTER_NEAREST;
+
   const uint kernelHXW = 1;
   int2 stride_xy = (int2)(stride, stride);
   int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
   int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
 #ifdef BIASE
     half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #else
     half4 output = 0.0f;
 #endif
 
-  int out_c_p = 0, out_w_p = 0, out_nh_p = 0;
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        half4 input = read_imageh(input_image, sampler, pos_in);
 
+        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
 /*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after bias output4 = %v4hlf \n", out);
+        output.x = dot(input, weight0);
+        output.y = dot(input, weight1);
+        output.z = dot(input, weight2);
+        output.w = dot(input, weight3);
+*/
 
-  }
+        output = mad(input.x, weight0, output);
+        output = mad(input.y, weight1, output);
+        output = mad(input.z, weight2, output);
+        output = mad(input.w, weight3, output);
 
-*/
+   }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}
+
+
+
+/*
+
+__kernel void conv_1x1_4(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,
+                       __private const int input_height,
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0) * 4;
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0));
+    half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0));
+    half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0));
+#else
+    half4 output0 = 0.0f;
+    half4 output1 = 0.0f;
+    half4 output2 = 0.0f;
+    half4 output3 = 0.0f;
+#endif
 
    for (int i = 0; i < input_c; ++i) {
         int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
         half4 input = read_imageh(input_image, sampler, pos_in);
 
-        half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
-        output.x += dot(input, weight_x);
+        half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
 
-        half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
-        output.y += dot(input, weight_y);
+        output0 = mad(input.x, weight0_0, output0);
+        output0 = mad(input.y, weight0_1, output0);
+        output0 = mad(input.z, weight0_2, output0);
+        output0 = mad(input.w, weight0_3, output0);
 
-        half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
-        output.z += dot(input, weight_z);
+        half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0));
+        half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1));
+        half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2));
+        half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3));
 
-        half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
-        output.w += dot(input, weight_w);
-/*
-        if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-            printf("x - %d \n", pos_in.x);
+        output1 = mad(input.x, weight1_0, output1);
+        output1 = mad(input.y, weight1_1, output1);
+        output1 = mad(input.z, weight1_2, output1);
+        output1 = mad(input.w, weight1_3, output1);
 
-            printf("y - %d \n", pos_in.y);
+        half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0));
+        half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1));
+        half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2));
+        half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3));
 
-            float4 in = (float4)(input.x, input.y, input.z, input.w);
-            printf("input4 = %v4hlf \n", in);
+        output2 = mad(input.x, weight2_0, output2);
+        output2 = mad(input.y, weight2_1, output2);
+        output2 = mad(input.z, weight2_2, output2);
+        output2 = mad(input.w, weight2_3, output2);
 
-            float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w);
-            printf("weight4 = %v4hlf \n", w);
+        half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0));
+        half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1));
+        half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2));
+        half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3));
 
-        }
-*/
-  }
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf("output4 = %v4hlf \n", out);
+        output3 = mad(input.x, weight3_0, output3);
+        output3 = mad(input.y, weight3_1, output3);
+        output3 = mad(input.z, weight3_2, output3);
+        output3 = mad(input.w, weight3_3, output3);
 
-  }
-
-*/
+   }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
-#endif
+    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0));
 
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after batch output4 = %v4hlf \n", out);
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0));
 
-  }
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0));
 
-*/
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0));
+
+#endif
 
 #ifdef RELU
-  output = activation(output);
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif
 
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after relu output4 = %v4hlf \n", out);
+  int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos0, output0);
 
-  }
 
-*/
+  int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos1, output1);
 
-  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos, output);
+
+  int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos2, output2);
+
+
+  int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos3, output3);
 }
+
+*/
diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
index d59e74519653edd754b52e8b3b4a4b740dd46438..3c3497f917d8a16c7c7e304edf00a4250066dce7 100644
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
@@ -15,4 +15,4 @@ limitations under the License. */
 #define BIASE
 #define BATCH_NORM
 #define RELU
-#include "conv_kernel.inc.cl"
\ No newline at end of file
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
index fddd09450422de02e91f91cfcea2e9bea2c6e049..2a5c823295c7562361433414cf35be81d2fbf00c 100644
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
@@ -12,4 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "conv_kernel.inc.cl"
\ No newline at end of file
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
index 642ec025e151be0f2eafb457a3fa20ed2d292e8b..f304764868959ce028a8448c4d311db878cc1f6e 100644
--- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
      int x = get_global_id(0);
diff --git a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
index 32d93ad93e3e181d8f8b2470d18968842236a595..80d741d859af633299120bfec9f4cfeeaeb47194 100644
--- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
  {
diff --git a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
index 8fba62e91f8f60f8d71c486b69a65cd61a192a5a..64bb1845b0bd2c04c8761845b90dbed9e391a77b 100644
--- a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 __kernel void fetch(__private const int in_height,
diff --git a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
index 18246fddcfb803adeae5cc9e2efeba1a4362aa2e..fc660941f8863a0056c4618f0207ae69533d3242 100644
--- a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define MIN_VALUE -FLT_MAX
 
@@ -72,4 +86,4 @@ __kernel void pool_avg(
   half4 avg = sum / num;
   const int pos_out_x = mad24(out_c, out_width, out_w);
   write_imageh(output, (int2)(pos_out_x, out_nh), avg);
-}
\ No newline at end of file
+}
diff --git a/src/operators/kernel/cl/cl_kernel/relu.cl b/src/operators/kernel/cl/cl_kernel/relu.cl
index baf28c9304bfb0b04301a5eb4afe4f658ba13072..cc8f9c3742f7794c51a5e04ac4edde617af0e388 100644
--- a/src/operators/kernel/cl/cl_kernel/relu.cl
+++ b/src/operators/kernel/cl/cl_kernel/relu.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 __kernel void relu(__read_only image2d_t input,
@@ -54,4 +55,4 @@ __kernel void relu_p1(__read_only image2d_t input,
 
   half4 in = read_imageh(input, sampler, (int2)(x, y));
   write_imageh(output, (int2)(x, y), in);
-}
\ No newline at end of file
+}
diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl
index 215ec69fc283dcb2b538300cb5591b2b9e4b6a13..a4514c70640dd6f9582a7362d489f42a58556dcb 100644
--- a/src/operators/kernel/cl/cl_kernel/softmax.cl
+++ b/src/operators/kernel/cl/cl_kernel/softmax.cl
@@ -33,17 +33,17 @@ __kernel void softmax(__read_only image2d_t input_image,
     maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w))));
   }
 
-
   half4 rsum = (half4)(0.0f);
+
   for (int i = 0; i < group; ++i) {
     half4 r = read_imageh(input_image, sampler, (int2)(i, 0));
-    rsum += convert_half4(exp(convert_float4(r - maxv)));
+    rsum += exp(r - maxv);
   }
 
   float sum = rsum.x + rsum.y + rsum.z + rsum.w;
 
   half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh));
-  half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum);
+  half4 result = exp(rr - maxv) / sum;
   write_imageh(output_image, (int2)(out_w, out_nh), result);
 }
 
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index 0bc348e1707f1b66ea6efc5ed09df458e66b871d..ec81044f08d41bab41652986c21fab655c714c8f 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -125,10 +125,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 
   param->SetOffset(offset);
 
-  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+
+  /*
+  if (param->Filter()->dims()[2] == 1 &&
+      param->Filter()->dims()[3] == 1 &&
+      (param->Filter()->dims()[0] % 16) == 0) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl");
+    DLOG << " conv add bn relu conv 1x1 4";
+  }
+  */
+  if (param->Filter()->dims()[2] == 1 &&
+      param->Filter()->dims()[3] == 1) {
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
     this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
     DLOG << " conv add bn relu conv 1x1";
   } else if (param->Filter()->dims()[1] == 1 &&
@@ -249,6 +260,23 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   //  cl_event out_event = param.Output()->GetClEvent();
   //  cl_event wait_event = param.Input()->GetClEvent();
 
+  /*
+  if (param.Filter()->dims()[2] == 1 &&
+      param.Filter()->dims()[3] == 1 &&
+      param.Filter()->dims()[0] % 16 == 0) {
+    DLOG << " before modifi work size: " << default_work_size;
+
+    default_work_size[0] = default_work_size[0] / 4;
+
+    DLOG << " modification work size: " << default_work_size;
+    DLOG << " input dims " << param.Input()->dims();
+    DLOG << " output dims " << param.Output()->dims();
+    DLOG << " filter dims: " << param.Filter()->dims();
+    DLOG << " biase dims : " << param.Bias()->dims();
+
+  }
+  */
+
   status = clEnqueueNDRangeKernel(
       this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
       default_work_size.data(), NULL, 0, NULL, NULL);
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index 202d6608d50bdc9691e3739b2e721d427847e723..6bb972be5736442efbb83020179a8027b51e1cc0 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -13,19 +13,33 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+#include <iostream>
 
 #include "../test_helper.h"
 #include "framework/loader.h"
 
 int main() {
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::GPU_CL> loader;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
-  //  auto program = loader.Load(g_googlenet, true);
+
+  std::cout << " Begin load mobilenet " << std::endl;
+
+  auto program = loader.Load(std::string(g_mobilenet_mul), true);
+
+  std::cout << " End load mobilenet " << std::endl;
+
+  std::cout << " Begin load YOLO " << std::endl;
+
+  auto program1 = loader.Load(std::string(g_yolo_mul), true);
+
+  std::cout << " End load YOLO " << std::endl;
+
   //  auto program = loader.Load(g_mobilenet_ssd, true);
 
-  auto program = loader.Load(std::string(g_ocr) + "/model",
-                             std::string(g_ocr) + "/params", false);
+//  auto program = loader.Load(std::string(g_ocr) + "/model",
+//                             std::string(g_ocr) + "/params", false);
   //  program.originProgram->Description("program desc: ");
+
   return 0;
 }
diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp
index fa8564be1515d0498ea4040da7e9712debe20cba..a5276d6e521855ad81e6b9e2edb58c271ae713d9 100644
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -23,7 +23,7 @@ int main() {
   //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
   //                     std::string(g_mobilenet_detect) + "/params", true);
 
-  auto isok = paddle_mobile.Load(g_mobilenet, true);
+  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
   if (isok) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
@@ -33,24 +33,15 @@ int main() {
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
 
-    std::vector<float> vec_result;
-    //            = paddle_mobile.Predict(input, dims);
+    std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
 
     auto time3 = paddle_mobile::time();
-    int max = 1;
+    int max = 10;
     for (int i = 0; i < max; ++i) {
       vec_result = paddle_mobile.Predict(input, dims);
     }
     auto time4 = paddle_mobile::time();
 
-    //    auto time3 = paddle_mobile::time();
-
-    //    for (int i = 0; i < 10; ++i) {
-    //      auto vec_result = paddle_mobile.Predict(input, dims);
-    //    }
-
-    //    auto time4 = paddle_mobile::time();
-
     std::cout << "predict cost :"
               << paddle_mobile::time_diff(time3, time4) / max << "ms"
               << std::endl;
diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp
index 12d36a653fef1d465e19fba4e13830883dab8b6b..f6899277f8724a580ec5648cd8a9ddc7a6f4c7f2 100644
--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
@@ -23,7 +23,7 @@ int main() {
     //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
     //                     std::string(g_mobilenet_detect) + "/params", true);
 
-    auto isok = paddle_mobile.Load(g_yolo_mul, true);
+    auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
     if (isok) {
         auto time2 = paddle_mobile::time();
         std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"