Merge pull request #1211 from codeWorm2015/opencl

commit for test

Merge pull request #1211 from codeWorm2015/opencl
commit for test
44643650 · Ray Liu · GitHub · 9e330eae · 56db3b86 · 44643650
19 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ if (DEBUGING)
    message(STATUS "debug")
    set(CMAKE_BUILD_TYPE Release)
    set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
+#    add_definitions(-DPADDLE_MOBILE_DEBUG)
 else ()
    set(CMAKE_BUILD_TYPE Release)
    set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")

--- a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-__kernel void batchnorm(__private const int out_height,
-                        __private const int out_width,
+__kernel void batchnorm(__private const int out_width,
                        __read_only image2d_t input,
                        __read_only image2d_t new_scale_image,
                        __read_only image2d_t new_bias_image,

--- a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) {
     int x = get_global_id(0);

--- a/src/operators/kernel/cl/cl_kernel/cl_common.h
+++ b/src/operators/kernel/cl/cl_kernel/cl_common.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

 inline half4 activation(half4 in

--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #define BIASE
 #define BATCH_NORM
 #define RELU

--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #define BIASE

 #include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -56,7 +56,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    if (out_c >= global_size_dim0 ||
        out_w >= global_size_dim1 ||
        out_nh >= global_size_dim2) {
-        printf(" out of range ");
        return;
    }

@@ -134,22 +133,22 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));

        for (int j = 0; j < 9; ++j) {
-            int2 fuck;
-            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, fuck);
+            int2 pos_of_weight;
+            pos_of_weight.x = i * 3 + j % 3;
+            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
            output.x += dot(input[j], weight_x);

-            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
            output.y += dot(input[j], weight_y);

-            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
            output.z += dot(input[j], weight_z);

-            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, fuck);
+            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
            output.w += dot(input[j], weight_w);
        }
    }
@@ -321,6 +320,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,

 }

+
 __kernel void conv_1x1(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
@@ -349,92 +349,179 @@ __kernel void conv_1x1(__private const int global_size_dim0,
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                           CLK_ADDRESS_CLAMP         |
                           CLK_FILTER_NEAREST;
+
  const uint kernelHXW = 1;
  int2 stride_xy = (int2)(stride, stride);
  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
 #ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #else
    half4 output = 0.0f;
 #endif

-  int out_c_p = 0, out_w_p = 0, out_nh_p = 0;
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        half4 input = read_imageh(input_image, sampler, pos_in);

+        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
 /*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after bias output4 = %v4hlf \n", out);
+        output.x = dot(input, weight0);
+        output.y = dot(input, weight1);
+        output.z = dot(input, weight2);
+        output.w = dot(input, weight3);
+*/

-  }
+        output = mad(input.x, weight0, output);
+        output = mad(input.y, weight1, output);
+        output = mad(input.z, weight2, output);
+        output = mad(input.w, weight3, output);

-*/
+   }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}
+
+
+
+/*
+
+__kernel void conv_1x1_4(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,
+                       __private const int input_height,
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0) * 4;
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE
+    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0));
+    half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0));
+    half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0));
+#else
+    half4 output0 = 0.0f;
+    half4 output1 = 0.0f;
+    half4 output2 = 0.0f;
+    half4 output3 = 0.0f;
+#endif

   for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
        half4 input = read_imageh(input_image, sampler, pos_in);

-        half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
-        output.x += dot(input, weight_x);
+        half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));

-        half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
-        output.y += dot(input, weight_y);
+        output0 = mad(input.x, weight0_0, output0);
+        output0 = mad(input.y, weight0_1, output0);
+        output0 = mad(input.z, weight0_2, output0);
+        output0 = mad(input.w, weight0_3, output0);

-        half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
-        output.z += dot(input, weight_z);
+        half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0));
+        half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1));
+        half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2));
+        half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3));

-        half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
-        output.w += dot(input, weight_w);
-/*
-        if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-            printf("x - %d \n", pos_in.x);
+        output1 = mad(input.x, weight1_0, output1);
+        output1 = mad(input.y, weight1_1, output1);
+        output1 = mad(input.z, weight1_2, output1);
+        output1 = mad(input.w, weight1_3, output1);

-            printf("y - %d \n", pos_in.y);
+        half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0));
+        half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1));
+        half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2));
+        half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3));

-            float4 in = (float4)(input.x, input.y, input.z, input.w);
-            printf("input4 = %v4hlf \n", in);
+        output2 = mad(input.x, weight2_0, output2);
+        output2 = mad(input.y, weight2_1, output2);
+        output2 = mad(input.z, weight2_2, output2);
+        output2 = mad(input.w, weight2_3, output2);

-            float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w);
-            printf("weight4 = %v4hlf \n", w);
+        half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0));
+        half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1));
+        half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2));
+        half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3));

-        }
-*/
-  }
-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf("output4 = %v4hlf \n", out);
+        output3 = mad(input.x, weight3_0, output3);
+        output3 = mad(input.y, weight3_1, output3);
+        output3 = mad(input.z, weight3_2, output3);
+        output3 = mad(input.w, weight3_3, output3);

-  }
-
-*/
+   }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
-#endif
+    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0));

-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after batch output4 = %v4hlf \n", out);
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0));

-  }
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0));

-*/
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0));
+
+#endif

 #ifdef RELU
-  output = activation(output);
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif

-/*
-  if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" after relu output4 = %v4hlf \n", out);
+  int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos0, output0);

-  }

-*/
+  int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos1, output1);

-  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-  write_imageh(output_image, output_pos, output);
+
+  int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos2, output2);
+
+
+  int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos3, output3);
 }
+
+*/
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
@@ -15,4 +15,4 @@ limitations under the License. */
 #define BIASE
 #define BATCH_NORM
 #define RELU
-#include "conv_kernel.inc.cl"
\ No newline at end of file
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
@@ -12,4 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "conv_kernel.inc.cl"
\ No newline at end of file
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
     int x = get_global_id(0);

--- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
 {

--- a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

 __kernel void fetch(__private const int in_height,

--- a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define MIN_VALUE -FLT_MAX

@@ -72,4 +86,4 @@ __kernel void pool_avg(
  half4 avg = sum / num;
  const int pos_out_x = mad24(out_c, out_width, out_w);
  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
-}
\ No newline at end of file
+}
--- a/src/operators/kernel/cl/cl_kernel/relu.cl
+++ b/src/operators/kernel/cl/cl_kernel/relu.cl
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

 __kernel void relu(__read_only image2d_t input,
@@ -54,4 +55,4 @@ __kernel void relu_p1(__read_only image2d_t input,

  half4 in = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y), in);
-}
\ No newline at end of file
+}
--- a/src/operators/kernel/cl/cl_kernel/softmax.cl
+++ b/src/operators/kernel/cl/cl_kernel/softmax.cl
@@ -33,17 +33,17 @@ __kernel void softmax(__read_only image2d_t input_image,
    maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w))));
  }

-
  half4 rsum = (half4)(0.0f);
+
  for (int i = 0; i < group; ++i) {
    half4 r = read_imageh(input_image, sampler, (int2)(i, 0));
-    rsum += convert_half4(exp(convert_float4(r - maxv)));
+    rsum += exp(r - maxv);
  }

  float sum = rsum.x + rsum.y + rsum.z + rsum.w;

  half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh));
-  half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum);
+  half4 result = exp(rr - maxv) / sum;
  write_imageh(output_image, (int2)(out_w, out_nh), result);
 }


--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -125,10 +125,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(

  param->SetOffset(offset);

-  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+
+  /*
+  if (param->Filter()->dims()[2] == 1 &&
+      param->Filter()->dims()[3] == 1 &&
+      (param->Filter()->dims()[0] % 16) == 0) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl");
+    DLOG << " conv add bn relu conv 1x1 4";
+  }
+  */
+  if (param->Filter()->dims()[2] == 1 &&
+      param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-
    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
    DLOG << " conv add bn relu conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&
@@ -249,6 +260,23 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  //  cl_event out_event = param.Output()->GetClEvent();
  //  cl_event wait_event = param.Input()->GetClEvent();

+  /*
+  if (param.Filter()->dims()[2] == 1 &&
+      param.Filter()->dims()[3] == 1 &&
+      param.Filter()->dims()[0] % 16 == 0) {
+    DLOG << " before modifi work size: " << default_work_size;
+
+    default_work_size[0] = default_work_size[0] / 4;
+
+    DLOG << " modification work size: " << default_work_size;
+    DLOG << " input dims " << param.Input()->dims();
+    DLOG << " output dims " << param.Output()->dims();
+    DLOG << " filter dims: " << param.Filter()->dims();
+    DLOG << " biase dims : " << param.Bias()->dims();
+
+  }
+  */
+
  status = clEnqueueNDRangeKernel(
      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
      default_work_size.data(), NULL, 0, NULL, NULL);

--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -13,19 +13,33 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <string>
+#include <iostream>

 #include "../test_helper.h"
 #include "framework/loader.h"

 int main() {
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::GPU_CL> loader;
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
-  //  auto program = loader.Load(g_googlenet, true);
+
+  std::cout << " Begin load mobilenet " << std::endl;
+
+  auto program = loader.Load(std::string(g_mobilenet_mul), true);
+
+  std::cout << " End load mobilenet " << std::endl;
+
+  std::cout << " Begin load YOLO " << std::endl;
+
+  auto program1 = loader.Load(std::string(g_yolo_mul), true);
+
+  std::cout << " End load YOLO " << std::endl;
+
  //  auto program = loader.Load(g_mobilenet_ssd, true);

-  auto program = loader.Load(std::string(g_ocr) + "/model",
-                             std::string(g_ocr) + "/params", false);
+//  auto program = loader.Load(std::string(g_ocr) + "/model",
+//                             std::string(g_ocr) + "/params", false);
  //  program.originProgram->Description("program desc: ");
+
  return 0;
 }
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -23,7 +23,7 @@ int main() {
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);

-  auto isok = paddle_mobile.Load(g_mobilenet, true);
+  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
@@ -33,24 +33,15 @@ int main() {
    std::vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);

-    std::vector<float> vec_result;
-    //            = paddle_mobile.Predict(input, dims);
+    std::vector<float> vec_result = paddle_mobile.Predict(input, dims);

    auto time3 = paddle_mobile::time();
-    int max = 1;
+    int max = 10;
    for (int i = 0; i < max; ++i) {
      vec_result = paddle_mobile.Predict(input, dims);
    }
    auto time4 = paddle_mobile::time();

-    //    auto time3 = paddle_mobile::time();
-
-    //    for (int i = 0; i < 10; ++i) {
-    //      auto vec_result = paddle_mobile.Predict(input, dims);
-    //    }
-
-    //    auto time4 = paddle_mobile::time();
-
    std::cout << "predict cost :"
              << paddle_mobile::time_diff(time3, time4) / max << "ms"
              << std::endl;

--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
@@ -23,7 +23,7 @@ int main() {
    //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
    //                     std::string(g_mobilenet_detect) + "/params", true);

-    auto isok = paddle_mobile.Load(g_yolo_mul, true);
+    auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
    if (isok) {
        auto time2 = paddle_mobile::time();
        std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"