From df53e7f95f80ecced71273c8eef985501cf929e8 Mon Sep 17 00:00:00 2001
From: liuruilong <liuruilong@baidu.com>
Date: Fri, 26 Oct 2018 15:24:05 +0800
Subject: [PATCH]  commit for test

---
 CMakeLists.txt                                |   2 +-
 .../kernel/cl/cl_kernel/batchnorm_kernel.cl   |  17 +-
 .../kernel/cl/cl_kernel/channel_add_kernel.cl |   1 +
 src/operators/kernel/cl/cl_kernel/cl_common.h |   2 +
 .../cl/cl_kernel/conv_add_bn_relu_kernel.cl   |   1 -
 .../kernel/cl/cl_kernel/conv_add_kernel.cl    |   1 -
 .../kernel/cl/cl_kernel/conv_kernel.inc.cl    | 207 +++++++++++++-----
 .../depthwise_conv_add_bn_relu_kernel.cl      |   2 +-
 .../cl/cl_kernel/depthwise_conv_kernel.cl     |   2 +-
 .../cl/cl_kernel/elementwise_add_kernel.cl    |   1 +
 .../kernel/cl/cl_kernel/feed_kernel.cl        |  14 ++
 .../kernel/cl/cl_kernel/fetch_kernel.cl       |  14 ++
 .../kernel/cl/cl_kernel/pool_kernel.cl        |  16 +-
 src/operators/kernel/cl/cl_kernel/relu.cl     |   3 +-
 src/operators/kernel/cl/cl_kernel/softmax.cl  |   6 +-
 .../kernel/cl/conv_add_bn_relu_kernel.cpp     |  32 ++-
 test/framework/test_load.cpp                  |  22 +-
 test/net/test_mobilenet_GPU.cpp               |  15 +-
 test/net/test_yologpu.cpp                     |   2 +-
 19 files changed, 269 insertions(+), 91 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b40aee8088..553caa9c47 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ if (DEBUGING)
     message(STATUS "debug")
     set(CMAKE_BUILD_TYPE Release)
     set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
+#    add_definitions(-DPADDLE_MOBILE_DEBUG)
 else ()
     set(CMAKE_BUILD_TYPE Release)
     set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
index bb89ceb139..9d0857a45e 100644
--- a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
@@ -1,7 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-__kernel void batchnorm(__private const int out_height,
-                        __private const int out_width,
+__kernel void batchnorm(__private const int out_width,
                         __read_only image2d_t input,
                         __read_only image2d_t new_scale_image,
                         __read_only image2d_t new_bias_image,
diff --git a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
index 54835ab897..1f2e36687a 100644
--- a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) {
      int x = get_global_id(0);
diff --git a/src/operators/kernel/cl/cl_kernel/cl_common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h
index d718ea48ae..34f36eb9a3 100644
--- a/src/operators/kernel/cl/cl_kernel/cl_common.h
+++ b/src/operators/kernel/cl/cl_kernel/cl_common.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 inline half4 activation(half4 in
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
index 7928fc33c0..aa3eaedda5 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #define BIASE
 #define BATCH_NORM
 #define RELU
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
index c5ca2f659b..b8bf7e7d7d 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #define BIASE
 
 #include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index b72e87288e..db3c8d3ca7 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -56,7 +56,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     if (out_c >= global_size_dim0 ||
         out_w >= global_size_dim1 ||
         out_nh >= global_size_dim2) {
-        printf(" out of range ");
         return;
     }
 
@@ -134,22 +133,22 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                           (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
 
         for (int j = 0; j < 9; ++j) {
-            int2 fuck;
-            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, fuck);
+            int2 pos_of_weight;
+            pos_of_weight.x = i * 3 + j % 3;
+            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
             output.x += dot(input[j], weight_x);
 
-            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
             output.y += dot(input[j], weight_y);
 
-            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
             output.z += dot(input[j], weight_z);
 
-            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
             output.w += dot(input[j], weight_w);
         }
     }
@@ -321,6 +320,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
 
 }
 
+
 __kernel void conv_1x1(__private const int global_size_dim0,
                        __private const int global_size_dim1,
                        __private const int global_size_dim2,
@@ -349,92 +349,179 @@ __kernel void conv_1x1(__private const int global_size_dim0,
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP         |
                            CLK_FILTER_NEAREST;
+
   const uint kernelHXW = 1;
   int2 stride_xy = (int2)(stride, stride);
   int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
   int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
 #ifdef BIASE
     half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #else
     half4 output = 0.0f;
 #endif
 
-  int out_c_p = 0, out_w_p = 0, out_nh_p = 0;
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        half4 input = read_imageh(input_image, sampler, pos_in);
 
+        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
 /*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after bias output4 = %v4hlf \n", out);
+        output.x = dot(input, weight0);
+        output.y = dot(input, weight1);
+        output.z = dot(input, weight2);
+        output.w = dot(input, weight3);
+*/
 
-  }
+        output = mad(input.x, weight0, output);
+        output = mad(input.y, weight1, output);
+        output = mad(input.z, weight2, output);
+        output = mad(input.w, weight3, output);
 
-*/
+   }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}
+
+
+
+/*
+
+__kernel void conv_1x1_4(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,
+                       __private const int input_height,
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0) * 4;
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0));
+    half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0));
+    half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0));
+#else
+    half4 output0 = 0.0f;
+    half4 output1 = 0.0f;
+    half4 output2 = 0.0f;
+    half4 output3 = 0.0f;
+#endif
 
    for (int i = 0; i < input_c; ++i) {
         int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
         half4 input = read_imageh(input_image, sampler, pos_in);
 
-        half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
-        output.x += dot(input, weight_x);
+        half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
 
-        half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
-        output.y += dot(input, weight_y);
+        output0 = mad(input.x, weight0_0, output0);
+        output0 = mad(input.y, weight0_1, output0);
+        output0 = mad(input.z, weight0_2, output0);
+        output0 = mad(input.w, weight0_3, output0);
 
-        half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
-        output.z += dot(input, weight_z);
+        half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0));
+        half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1));
+        half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2));
+        half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3));
 
-        half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
-        output.w += dot(input, weight_w);
-/*
-        if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-            printf("x - %d \n", pos_in.x);
+        output1 = mad(input.x, weight1_0, output1);
+        output1 = mad(input.y, weight1_1, output1);
+        output1 = mad(input.z, weight1_2, output1);
+        output1 = mad(input.w, weight1_3, output1);
 
-            printf("y - %d \n", pos_in.y);
+        half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0));
+        half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1));
+        half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2));
+        half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3));
 
-            float4 in = (float4)(input.x, input.y, input.z, input.w);
-            printf("input4 = %v4hlf \n", in);
+        output2 = mad(input.x, weight2_0, output2);
+        output2 = mad(input.y, weight2_1, output2);
+        output2 = mad(input.z, weight2_2, output2);
+        output2 = mad(input.w, weight2_3, output2);
 
-            float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w);
-            printf("weight4 = %v4hlf \n", w);
+        half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0));
+        half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1));
+        half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2));
+        half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3));
 
-        }
-*/
-  }
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf("output4 = %v4hlf \n", out);
+        output3 = mad(input.x, weight3_0, output3);
+        output3 = mad(input.y, weight3_1, output3);
+        output3 = mad(input.z, weight3_2, output3);
+        output3 = mad(input.w, weight3_3, output3);
 
-  }
-
-*/
+   }
 
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
-#endif
+    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0));
 
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after batch output4 = %v4hlf \n", out);
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0));
 
-  }
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0));
 
-*/
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0));
+
+#endif
 
 #ifdef RELU
-  output = activation(output);
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif
 
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after relu output4 = %v4hlf \n", out);
+  int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos0, output0);
 
-  }
 
-*/
+  int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos1, output1);
 
-  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos, output);
+
+  int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos2, output2);
+
+
+  int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos3, output3);
 }
+
+*/
diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
index d59e745196..3c3497f917 100644
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
@@ -15,4 +15,4 @@ limitations under the License. */
 #define BIASE
 #define BATCH_NORM
 #define RELU
-#include "conv_kernel.inc.cl"
\ No newline at end of file
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
index fddd094504..2a5c823295 100644
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
@@ -12,4 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "conv_kernel.inc.cl"
\ No newline at end of file
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
index 642ec025e1..f304764868 100644
--- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
      int x = get_global_id(0);
diff --git a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
index 32d93ad93e..80d741d859 100644
--- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
  {
diff --git a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
index 8fba62e91f..64bb1845b0 100644
--- a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 __kernel void fetch(__private const int in_height,
diff --git a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
index 18246fddcf..fc660941f8 100644
--- a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define MIN_VALUE -FLT_MAX
 
@@ -72,4 +86,4 @@ __kernel void pool_avg(
   half4 avg = sum / num;
   const int pos_out_x = mad24(out_c, out_width, out_w);
   write_imageh(output, (int2)(pos_out_x, out_nh), avg);
-}
\ No newline at end of file
+}
diff --git a/src/operators/kernel/cl/cl_kernel/relu.cl b/src/operators/kernel/cl/cl_kernel/relu.cl
index baf28c9304..cc8f9c3742 100644
--- a/src/operators/kernel/cl/cl_kernel/relu.cl
+++ b/src/operators/kernel/cl/cl_kernel/relu.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 __kernel void relu(__read_only image2d_t input,
@@ -54,4 +55,4 @@ __kernel void relu_p1(__read_only image2d_t input,
 
   half4 in = read_imageh(input, sampler, (int2)(x, y));
   write_imageh(output, (int2)(x, y), in);
-}
\ No newline at end of file
+}
diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl
index 215ec69fc2..a4514c7064 100644
--- a/src/operators/kernel/cl/cl_kernel/softmax.cl
+++ b/src/operators/kernel/cl/cl_kernel/softmax.cl
@@ -33,17 +33,17 @@ __kernel void softmax(__read_only image2d_t input_image,
     maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w))));
   }
 
-
   half4 rsum = (half4)(0.0f);
+
   for (int i = 0; i < group; ++i) {
     half4 r = read_imageh(input_image, sampler, (int2)(i, 0));
-    rsum += convert_half4(exp(convert_float4(r - maxv)));
+    rsum += exp(r - maxv);
   }
 
   float sum = rsum.x + rsum.y + rsum.z + rsum.w;
 
   half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh));
-  half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum);
+  half4 result = exp(rr - maxv) / sum;
   write_imageh(output_image, (int2)(out_w, out_nh), result);
 }
 
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index 0bc348e170..ec81044f08 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -125,10 +125,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 
   param->SetOffset(offset);
 
-  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+
+  /*
+  if (param->Filter()->dims()[2] == 1 &&
+      param->Filter()->dims()[3] == 1 &&
+      (param->Filter()->dims()[0] % 16) == 0) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl");
+    DLOG << " conv add bn relu conv 1x1 4";
+  }
+  */
+  if (param->Filter()->dims()[2] == 1 &&
+      param->Filter()->dims()[3] == 1) {
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
     this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
     DLOG << " conv add bn relu conv 1x1";
   } else if (param->Filter()->dims()[1] == 1 &&
@@ -249,6 +260,23 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   //  cl_event out_event = param.Output()->GetClEvent();
   //  cl_event wait_event = param.Input()->GetClEvent();
 
+  /*
+  if (param.Filter()->dims()[2] == 1 &&
+      param.Filter()->dims()[3] == 1 &&
+      param.Filter()->dims()[0] % 16 == 0) {
+    DLOG << " before modifi work size: " << default_work_size;
+
+    default_work_size[0] = default_work_size[0] / 4;
+
+    DLOG << " modification work size: " << default_work_size;
+    DLOG << " input dims " << param.Input()->dims();
+    DLOG << " output dims " << param.Output()->dims();
+    DLOG << " filter dims: " << param.Filter()->dims();
+    DLOG << " biase dims : " << param.Bias()->dims();
+
+  }
+  */
+
   status = clEnqueueNDRangeKernel(
       this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
       default_work_size.data(), NULL, 0, NULL, NULL);
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index 202d6608d5..6bb972be57 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -13,19 +13,33 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+#include <iostream>
 
 #include "../test_helper.h"
 #include "framework/loader.h"
 
 int main() {
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::GPU_CL> loader;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
-  //  auto program = loader.Load(g_googlenet, true);
+
+  std::cout << " Begin load mobilenet " << std::endl;
+
+  auto program = loader.Load(std::string(g_mobilenet_mul), true);
+
+  std::cout << " End load mobilenet " << std::endl;
+
+  std::cout << " Begin load YOLO " << std::endl;
+
+  auto program1 = loader.Load(std::string(g_yolo_mul), true);
+
+  std::cout << " End load YOLO " << std::endl;
+
   //  auto program = loader.Load(g_mobilenet_ssd, true);
 
-  auto program = loader.Load(std::string(g_ocr) + "/model",
-                             std::string(g_ocr) + "/params", false);
+//  auto program = loader.Load(std::string(g_ocr) + "/model",
+//                             std::string(g_ocr) + "/params", false);
   //  program.originProgram->Description("program desc: ");
+
   return 0;
 }
diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp
index fa8564be15..a5276d6e52 100644
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -23,7 +23,7 @@ int main() {
   //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
   //                     std::string(g_mobilenet_detect) + "/params", true);
 
-  auto isok = paddle_mobile.Load(g_mobilenet, true);
+  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
   if (isok) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
@@ -33,24 +33,15 @@ int main() {
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
 
-    std::vector<float> vec_result;
-    //            = paddle_mobile.Predict(input, dims);
+    std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
 
     auto time3 = paddle_mobile::time();
-    int max = 1;
+    int max = 10;
     for (int i = 0; i < max; ++i) {
       vec_result = paddle_mobile.Predict(input, dims);
     }
     auto time4 = paddle_mobile::time();
 
-    //    auto time3 = paddle_mobile::time();
-
-    //    for (int i = 0; i < 10; ++i) {
-    //      auto vec_result = paddle_mobile.Predict(input, dims);
-    //    }
-
-    //    auto time4 = paddle_mobile::time();
-
     std::cout << "predict cost :"
               << paddle_mobile::time_diff(time3, time4) / max << "ms"
               << std::endl;
diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp
index 12d36a653f..f6899277f8 100644
--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
@@ -23,7 +23,7 @@ int main() {
     //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
     //                     std::string(g_mobilenet_detect) + "/params", true);
 
-    auto isok = paddle_mobile.Load(g_yolo_mul, true);
+    auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
     if (isok) {
         auto time2 = paddle_mobile::time();
         std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
-- 
GitLab