Merge pull request #1109 from codeWorm2015/opencl

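Update the fusion conv kernels: declare the sampler before its first use, write int2 coordinates as (int2)(x, y) vector literals, add the missing semicolons in the BATCH_NORM branches, and pass the output's CL image in ConvAddBNReluKernel::Compute.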

Merge pull request #1109 from codeWorm2015/opencl
update fusion conv kernel
6df2ad4a · Ray Liu · GitHub · 3cb1bad6 · f791a439 · 6df2ad4a
5 changed files
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -57,7 +57,9 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}

 template <typename Dtype>
 void OperatorBase<Dtype>::Run() {
+  DLOG << " ----- Begin run impl --- " << type_ << " ----- ";
  RunImpl();
+  DLOG << " ----- End run impl --- " << type_ << " ----- ";
 #ifdef PADDLE_MOBILE_DEBUG
  DLOG << "-------------" << type_ << "----------------------------";
  vector<string> input_keys = GetInputKeys();

--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -14,7 +14,6 @@ limitations under the License. */

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-
 #define BIASE
 #define BATCH_NORM

@@ -54,21 +53,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    ouput_pos_in_one_block.x = out_w;
    ouput_pos_in_one_block.y = out_nh;

+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
    int2 in_pos_in_one_block;
    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

 #ifdef BIASE
-    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #else
    half4 output = 0.0f;
 #endif

   half4 input[9];

-   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP         |
-                              CLK_FILTER_NEAREST;
+

   for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
@@ -139,7 +141,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -250,7 +252,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
    }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -321,7 +323,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU

--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -56,17 +56,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

+   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP         |
+                              CLK_FILTER_NEAREST;
+
 #ifdef BIASE
-    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #else
    half4 output = 0.0f;
 #endif

   half4 input[9];

-   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP         |
-                              CLK_FILTER_NEAREST;
+

   for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
@@ -137,7 +139,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -248,7 +250,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
    }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -319,7 +321,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU

--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -117,7 +117,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  auto biase = param.Bias()->GetCLImage();
  auto new_scale = param.NewScale()->GetCLImage();
  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output();
+  auto output = param.Output()->GetCLImage();
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();

--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -23,7 +23,7 @@ int main() {
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);

-  auto isok = paddle_mobile.Load(g_mobilenet, false);
+  auto isok = paddle_mobile.Load(g_mobilenet, true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms"