From 190cb27806281e04262b7901c9bd1e1f0c70ade7 Mon Sep 17 00:00:00 2001
From: yangfei <yangfei11@baidu.com>
Date: Wed, 21 Nov 2018 10:09:05 +0800
Subject: [PATCH] imp transpose and reshape kernel

---
 src/framework/cl/cl_helper.h                  |   7 +
 src/framework/cl/cl_image.h                   |   8 +-
 src/framework/cl/cl_image_converter.cpp       |  37 ++++
 src/framework/cl/cl_image_converter.h         |  25 +++
 .../kernel/cl/cl_kernel/conv_kernel.inc.cl    |   8 +-
 .../kernel/cl/cl_kernel/prior_box_kernel.cl   |  40 ++++-
 src/operators/kernel/cl/cl_kernel/reshape.cl  | 158 ++++++++++++++++--
 .../kernel/cl/cl_kernel/transpose_kernel.cl   | 129 ++++++++++++++
 src/operators/kernel/cl/fetch_kernel.cpp      |  42 +++--
 src/operators/kernel/cl/prior_box_kernel.cpp  |  73 +++++---
 src/operators/kernel/cl/reshape_kernel.cpp    |  80 ++++++---
 src/operators/kernel/cl/transpose_kernel.cpp  |  38 ++++-
 src/operators/op_param.h                      |   2 +
 13 files changed, 547 insertions(+), 100 deletions(-)
 create mode 100644 src/operators/kernel/cl/cl_kernel/transpose_kernel.cl

diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h
index bea91ee24c..8a79d9bad7 100644
--- a/src/framework/cl/cl_helper.h
+++ b/src/framework/cl/cl_helper.h
@@ -61,9 +61,16 @@ class CLHelper {
       auto work_size_2 = n * h;
       return {work_size_0, work_size_1, work_size_2};
     } else if (image_dim.size() == 2) {
+      auto h = image_dim[0];
+      auto w = image_dim[1];
       return {1, image.ImageWidth(), image.ImageHeight()};
     } else if (image_dim.size() == 1) {
       return {1, image.ImageWidth(), 1};
+    } else if (image_dim.size() == 3) {
+      int c = image_dim[0];
+      int h = image_dim[1];
+      int w = image_dim[2];
+      return {(c + 3) / 4, w, h};
     }
     PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
   }
diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h
index 35f60d3b77..0c19661ede 100644
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -120,17 +120,19 @@ class CLImage {
     PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
                           " empty image tensor data shouldn't have value");
 
-    CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
+    //    CLImageConverterFolder *folder_converter = new
+    //    CLImageConverterFolder();
+    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
 
     DLOG << " to get image dims ";
-    image_dims_ = folder_converter->InitImageDimInfoWith(dim);
+    image_dims_ = normal_converter->InitImageDimInfoWith(dim);
     DLOG << " end get image dims " << image_dims_;
 
     InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
 
     tensor_dims_ = dim;
     command_queue_ = command_queue;
-    image_converter_ = folder_converter;
+    image_converter_ = normal_converter;
     cl_event_ = CLEngine::Instance()->CreateEvent(context);
     initialized_ = true;
     DLOG << " end init cl image";
diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp
index 13094a8d05..1e63bd0567 100644
--- a/src/framework/cl/cl_image_converter.cpp
+++ b/src/framework/cl/cl_image_converter.cpp
@@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
   }
 }
 
+const DDim &CLImageConverterNormal::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < tensor_dim.size(); ++j) {
+    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+  }
+  size_t N, C, H, W;
+  N = new_dims[0];
+  C = new_dims[1];
+  H = new_dims[2];
+  W = new_dims[3];
+  size_t width = W * ((C + 3) / 4);
+  size_t height = H * N;
+
+  width_of_one_block_ = W;
+  height_of_one_block_ = H;
+  c_block_ = width / W;
+
+  return make_ddim({width, height});
+}
+
+void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
+                                         const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
+                        "tensor dim is not support ");
+
+  CLImageConverterDefault default_converter;
+  default_converter.NCHWToImage(tensor, image, tensor_dim);
+}
+
+void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
+                                         const DDim &image_dim,
+                                         const DDim &tensor_dim) {
+  CLImageConverterDefault default_converter;
+  default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
+}
+
 }  // namespace framework
 }  // namespace paddle_mobile
diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h
index 02887b0cd4..ad5994f852 100644
--- a/src/framework/cl/cl_image_converter.h
+++ b/src/framework/cl/cl_image_converter.h
@@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase {
   int height_of_one_block_;
 };
 
+class CLImageConverterNormal : public CLImageConverterBase {
+ public:
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
+  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim);
+
+  /*
+   *  width of original tensor
+   * */
+  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
+
+  /*
+   *  height of original tensor
+   * */
+  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
+
+  int GetCBlock() const { return c_block_; }
+
+ private:
+  int c_block_;
+  int width_of_one_block_;
+  int height_of_one_block_;
+};
+
 class CLImageConverterNWBlock : public CLImageConverterBase {
   const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
   void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index 63e6e62345..2247df59fb 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
             int2 pos_of_weight;
             pos_of_weight.x = i * 3 + j % 3;
             pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            float4 weight_x = read_imagef(filter, sampler, pos_of_weight);
+            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
             output.x += dot(input[j], weight_x);
 
             pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            float4 weight_y = read_imagef(filter, sampler, pos_of_weight);
+            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
             output.y += dot(input[j], weight_y);
 
             pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            float4 weight_z = read_imagef(filter, sampler, pos_of_weight);
+            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
             output.z += dot(input[j], weight_z);
 
             pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            float4 weight_w = read_imagef(filter, sampler, pos_of_weight);
+            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
             output.w += dot(input[j], weight_w);
         }
 */
diff --git a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
index 311a5a195a..699d381ce6 100644
--- a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
@@ -19,7 +19,9 @@ __kernel void prior_box(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __global float *box_width,
                         __global float *box_height,
-                        __write_only image2d_t output_image,
+                        __global float *variances_Buffer,
+                        __write_only image2d_t output_boxes,
+                        __write_only image2d_t output_variances,
                         __private const float step_width,
                         __private const float step_height,
                         __private const float offset,
@@ -44,16 +46,25 @@ __kernel void prior_box(__private const int global_size_dim0,
                         float center_y = ((float)out_n + offset) * step_height;
 
                         half4 output[4];
+                        half4 variances[4];
                         output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width);
                         output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height);
                         output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width);
                         output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height);
+                        variances[0].x = convert_half(variances_Buffer[0]);
+                        variances[1].x = convert_half(variances_Buffer[1]);
+                        variances[2].x = convert_half(variances_Buffer[2]);
+                        variances[3].x = convert_half(variances_Buffer[3]);
 
                         if(C - 4 * out_c>=2){
                         output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width);
                         output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height);
                         output[2].y = convert_half((center_x1 + box_width[out_h]) / (float)img_width);
                         output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height);
+                        variances[0].y = convert_half(variances_Buffer[0]);
+                        variances[1].y = convert_half(variances_Buffer[1]);
+                        variances[2].y = convert_half(variances_Buffer[2]);
+                        variances[3].y = convert_half(variances_Buffer[3]);
                         }else{
                          output[0].y = 0.0f;
                          output[1].y = 0.0f;
@@ -65,6 +76,10 @@ __kernel void prior_box(__private const int global_size_dim0,
                         output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height);
                         output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width);
                         output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height);
+                        variances[0].z = convert_half(variances_Buffer[0]);
+                        variances[1].z = convert_half(variances_Buffer[1]);
+                        variances[2].z = convert_half(variances_Buffer[2]);
+                        variances[3].z = convert_half(variances_Buffer[3]);
                         }else{
                         output[0].z = 0.0f;
                         output[1].z = 0.0f;
@@ -76,6 +91,10 @@ __kernel void prior_box(__private const int global_size_dim0,
                         output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height);
                         output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width);
                         output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height);
+                        variances[0].w = convert_half(variances_Buffer[0]);
+                        variances[1].w = convert_half(variances_Buffer[1]);
+                        variances[2].w = convert_half(variances_Buffer[2]);
+                        variances[3].w = convert_half(variances_Buffer[3]);
                         }else{
                         output[0].w = 0.0f;
                         output[1].w = 0.0f;
@@ -88,10 +107,21 @@ __kernel void prior_box(__private const int global_size_dim0,
                          output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
                          output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
                         }
+                        if(output_pos.x == 0 && output_pos.y == 1){
+                          float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x);
+                          printf("output = %v4hlf \n", out);
+
+                        }
+
+                        write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]);
+                        write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]);
+                        write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]);
+                        write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]);
+
+                        write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]);
+                        write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]);
+                        write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]);
+                        write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]);
 
-                        write_imageh(output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
-                        write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
-                        write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
-                        write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
 
 }
\ No newline at end of file
diff --git a/src/operators/kernel/cl/cl_kernel/reshape.cl b/src/operators/kernel/cl/cl_kernel/reshape.cl
index 0ffc64f15c..7957001c96 100644
--- a/src/operators/kernel/cl/cl_kernel/reshape.cl
+++ b/src/operators/kernel/cl/cl_kernel/reshape.cl
@@ -14,26 +14,150 @@ limitations under the License. */
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-__kernel void reshape(__read_only image2d_t input,
-                      __write_only image2d_t output,
-                      __private const int d0,
-                      __private const int d1,
-                      __private const int d2,
-                      __private const int d3,
-                      __private const int x0,
-                      __private const int x1,
-                      __private const int x2,
-                      __private const int x3) {
-  const int x = get_global_id(0);
-  const int y = get_global_id(1);
+__kernel void reshape(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int out_C,
+                      __private const int out_H,
+                      __private const int out_W,
+                      __private const int in_W,
+                      __private const int in_H,
+                      __private const int in_Stride0,
+                      __private const int in_Stride1,
+                      __private const int in_Stride2,
+                      __private const int out_Stride0,
+                      __private const int out_Stride1,
+                      __private const int out_Stride2) {
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                            CLK_ADDRESS_CLAMP |
-                            CLK_FILTER_NEAREST;
+                       const int out_c = get_global_id(0);
+                       const int out_w = get_global_id(1);
+                       const int out_nh = get_global_id(2);
+                       const int out_n = out_nh/out_H;
+                       const int out_h = out_nh%out_H;
+                       const int out_c0 = out_c * 4;
+                       const int out_c1 = out_c * 4 + 1;
+                       const int out_c2 = out_c * 4+ 2;
+                       const int out_c3 = out_c * 4+ 3;
+
+                       int count0 =  out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
+                       int count1 =  out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
+                       int count2 =  out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
+                       int count3 =  out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
+
+                       int in_n0 = count0/in_Stride2;
+                       int in_n1 = count1/in_Stride2;
+                       int in_n2 = count1/in_Stride2;
+                       int in_n3 = count2/in_Stride2;
+
+                       count0 = count0%in_Stride2;
+                       count1 = count1%in_Stride2;
+                       count2 = count2%in_Stride2;
+                       count3 = count3%in_Stride2;
+
+                       int in_c0 = count0/in_Stride1;
+                       int in_c1 = count1/in_Stride1;
+                       int in_c2 = count2/in_Stride1;
+                       int in_c3 = count3/in_Stride1;
+
+                       int in_h0 = (count0%in_Stride1)/in_Stride0;
+                       int in_h1 = (count1%in_Stride1)/in_Stride0;
+                       int in_h2 = (count2%in_Stride1)/in_Stride0;
+                       int in_h3 = (count3%in_Stride1)/in_Stride0;
+
+                       int in_w0 = (count0%in_Stride1)%in_Stride0;
+                       int in_w1 = (count1%in_Stride1)%in_Stride0;
+                       int in_w2 = (count2%in_Stride1)%in_Stride0;
+                       int in_w3 = (count3%in_Stride1)%in_Stride0;
+
+
+                       int2 input_pos0;
+                       int2 input_pos1;
+                       int2 input_pos2;
+                       int2 input_pos3;
+
+                       input_pos0.x = (in_c0/4) * in_W + in_w0;
+                       input_pos0.y = in_n0 * in_H + in_h0;
+
+                       input_pos1.x = (in_c1/4) * in_W + in_w1;
+                       input_pos1.y = in_n1 * in_H + in_h1;
+
+                       input_pos2.x = (in_c2/4) * in_W + in_w2;
+                       input_pos2.y = in_n2 * in_H + in_h2;
+
+                       input_pos3.x = (in_c3/4) * in_W + in_w3;
+                       input_pos3.y = in_n3 * in_H + in_h3;
+
+                       int2 output_pos;
+                       output_pos.x = out_c * out_W + out_w;
+                       output_pos.y = out_nh;
+
+                       const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                 CLK_ADDRESS_CLAMP      |
+                                                 CLK_FILTER_NEAREST;
+
+                       half4 input0;
+                       half4 input1;
+                       half4 input2;
+                       half4 input3;
+                       half4 output;
+
+                       input0 = read_imageh(input_image, sampler,input_pos0);
+                       if(in_c0%4==0){
+                          output.x = input0.x;
+                       }else if(in_c0%4==1){
+                          output.x = input0.y;
+                       }else if(in_c0%4==2){
+                          output.x = input0.z;
+                       }else{
+                          output.x = input0.w;
+                       }
+                       if(out_C - out_c * 4>=2){
+                          input1 = read_imageh(input_image, sampler,input_pos1);
+                       if(in_c1%4==0){
+                          output.y = input1.x;
+                       }else if(in_c1%4==1){
+                          output.y = input1.y;
+                       }else if(in_c1%4==2){
+                          output.y = input1.z;
+                       }else{
+                          output.y = input1.w;
+                       }
+
+                       }else{
+                          output.y = 0.0f;
+                       }
+
+                       if(out_C - out_c * 4>=3){
+                          input2 = read_imageh(input_image, sampler,input_pos2);
+
+                       if(in_c2%4==0){
+                          output.z = input2.x;
+                       }else if(in_c2%4==1){
+                          output.z = input1.y;
+                       }else if(in_c2%4==2){
+                          output.z = input2.z;
+                       }else{
+                          output.z = input2.w;
+                       }
+                       }else{
+                          output.z = 0.0f;
+                       }
 
-  half4 in = read_imageh(input, sampler, (int2)(x, y));
+                       if(out_C - out_c * 4>=4){
+                          input3 = read_imageh(input_image, sampler,input_pos3);
+                       if(in_c3%4==0){
+                          output.w = input3.x;
+                       }else if(in_c3%4==1){
+                          output.w = input3.y;
+                       }else if(in_c3%4==2){
+                          output.w = input3.z;
+                       }else{
+                          output.w = input3.w;
+                       }
+                       }else{
+                          output.w = 0.0f;
+                       }
 
-  write_imageh(output, (int2)(x, y), in);
+                       write_imageh(output_image, output_pos, output);
 }
 
 
diff --git a/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl b/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl
new file mode 100644
index 0000000000..93d2148c83
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl
@@ -0,0 +1,129 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void transpose_4d( __read_only image2d_t input_image,
+                            __write_only image2d_t output_image,
+                            __private const int out_C,
+                            __private const int out_H,
+                            __private const int out_W,
+                            __private const int in_W
+                           ){
+                           const int out_c = get_global_id(0);
+                           const int out_w = get_global_id(1);
+                           const int out_nh = get_global_id(2);
+                           const int out_n =  1;
+                           const int out_h = out_nh%out_H;
+                           const int out_c0 = out_c * 4;
+                           const int out_c1 = out_c * 4 + 1;
+                           const int out_c2 = out_c * 4+ 2;
+                           const int out_c3 = out_c * 4+ 3;
+
+                           const int in_n = out_n;
+                           const int in_c = out_w / 4;
+                           const int in_h0 = out_c0;
+                           const int in_h1 = out_c1;
+                           const int in_h2 = out_c2;
+                           const int in_h3 = out_c3;
+                           const int in_w = out_h;
+
+                           int2 output_pos;
+                           output_pos.x = out_c * out_W + out_w;
+                           output_pos.y = out_nh;
+
+                           int2 input_pos0;
+                           int2 input_pos1;
+                           int2 input_pos2;
+                           int2 input_pos3;
+
+                           input_pos0.x = in_W * in_c + in_w;
+                           input_pos0.y = in_n * in_h0;
+
+                           input_pos1.x = in_W * in_c + in_w;
+                           input_pos1.y = in_n * in_h1;
+
+                           input_pos2.x = in_W * in_c + in_w;
+                           input_pos2.y = in_n * in_h2;
+
+                           input_pos3.x = in_W * in_c + in_w;
+                           input_pos3.y = in_n * in_h3;
+
+                           const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                         CLK_ADDRESS_CLAMP      |
+                                                         CLK_FILTER_NEAREST;
+
+                           half4 input0;
+                           half4 input1;
+                           half4 input2;
+                           half4 input3;
+                           half4 output;
+                           input0 = read_imageh(input_image, sampler,input_pos0);
+
+                            if(out_w%4==0){
+                            output.x = input0.x;
+                           }else if(out_w%4==1){
+                            output.x = input0.y;
+                           }else if(out_w%4==2){
+                            output.x = input0.z;
+                           }else{
+                            output.x = input0.w;
+                           }
+                           if(out_C - out_c * 4>=2){
+                            input1 = read_imageh(input_image, sampler,input_pos1);
+                            if(out_w%4==0){
+                            output.y = input1.x;
+                            }else if(out_w%4==1){
+                             output.y = input1.y;
+                            }else if(out_w%4==2){
+                             output.y = input1.z;
+                            }else{
+                             output.y = input1.w;
+                            }
+
+                           }else{
+                           output.y = 0.0f;
+                           }
+
+                           if(out_C - out_c * 4>=3){
+                            input2 = read_imageh(input_image, sampler,input_pos2);
+
+                          if(out_w%4==0){
+                            output.z = input2.x;
+                            }else if(out_w%4==1){
+                             output.z = input1.y;
+                            }else if(out_w%4==2){
+                             output.z = input2.z;
+                            }else{
+                             output.z = input2.w;
+                            }
+                           }else{
+                            output.z = 0.0f;
+                           }
+
+                           if(out_C - out_c * 4>=4){
+                            input3 = read_imageh(input_image, sampler,input_pos3);
+                           if(out_w%4==0){
+                            output.w = input3.x;
+                            }else if(out_w%4==1){
+                             output.w = input3.y;
+                            }else if(out_w%4==2){
+                             output.w = input3.z;
+                            }else{
+                             output.w = input3.w;
+                            }
+                           }else{
+                           output.w = 0.0f;
+                           }
+                           write_imageh(output_image, output_pos, output);
+}
\ No newline at end of file
diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp
index 8ea0b3ad3d..ded90ff43f 100644
--- a/src/operators/kernel/cl/fetch_kernel.cpp
+++ b/src/operators/kernel/cl/fetch_kernel.cpp
@@ -22,11 +22,11 @@ namespace operators {
 
 template <>
 bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
-  if (param->InputX()->dims().size() <= 2) {
-    this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
-  } else {
-    this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
-  }
+  //  if (param->InputX()->dims().size() <= 2) {
+  //    this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
+  //  } else {
+  this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
+  //  }
   return true;
 }
 
@@ -49,11 +49,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
 
   C = new_dims[1];
   in_height = new_dims[2];
-  if (dim.size() <= 2) {
-    in_width = param.InputX()->ImageWidth();
-  } else {
-    in_width = new_dims[3];
-  }
+  //  if (dim.size() <= 2) {
+  //    in_width = param.InputX()->ImageWidth();
+  //  } else {
+  in_width = new_dims[3];
+  //  }
 
   CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
                          this->cl_helper_.CLCommandQueue());
@@ -64,16 +64,16 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
   clSetKernelArg(kernel, 1, sizeof(int), &in_width);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
   clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
-  if (dim.size() > 2) {
-    int size_ch = in_height * in_width;
-    int size_block = size_ch * 4;
-    int size_batch = size_ch * C;
-    int out_c = new_dims[1];
-    clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
-    clSetKernelArg(kernel, 5, sizeof(int), &size_block);
-    clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
-    clSetKernelArg(kernel, 7, sizeof(int), &out_c);
-  }
+  //  if (dim.size() > 2) {
+  int size_ch = in_height * in_width;
+  int size_block = size_ch * 4;
+  int size_batch = size_ch * C;
+  int out_c = new_dims[1];
+  clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
+  clSetKernelArg(kernel, 5, sizeof(int), &size_block);
+  clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
+  clSetKernelArg(kernel, 7, sizeof(int), &out_c);
+  //  }
 
   //  cl_event wait_event = param.InpdutX()->GetClEvent();
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
@@ -93,8 +93,6 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
   //            << "ms" << std::endl;
 
   memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
-  DLOG << *param.InputX();
-  DLOG << *out;
 }
 
 template class FetchKernel<GPU_CL, float>;
diff --git a/src/operators/kernel/cl/prior_box_kernel.cpp b/src/operators/kernel/cl/prior_box_kernel.cpp
index 7867cf5ab1..92764b379e 100644
--- a/src/operators/kernel/cl/prior_box_kernel.cpp
+++ b/src/operators/kernel/cl/prior_box_kernel.cpp
@@ -79,6 +79,8 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
       paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
   float *box_height = static_cast<float *>(
       paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
+  float *variancesptr =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * 4));
   int idx = 0;
   for (size_t s = 0; s < min_sizes.size(); ++s) {
     auto min_size = min_sizes[s];
@@ -112,6 +114,9 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
       }
     }
   }
+  for (int i = 0; i < variances.size(); i++) {
+    variancesptr[i] = variances[i];
+  }
   cl_int status;
   auto kernel = this->cl_helper_.KernelAt(0);
   auto default_work_size =
@@ -135,23 +140,33 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
   cl_mem box_height_Buffer =
       box_height_cl_tensor.mutable_with_data<float>(box_height);
 
-  DLOG << "c_block:" << c_block;
-  DLOG << "w:" << w;
-  DLOG << "nh:" << nh;
-  DLOG << "step_width:" << step_width;
-  DLOG << "step_height:" << step_height;
-  DLOG << "offset:" << offset;
-  DLOG << "img_width:" << img_width;
-  DLOG << "img_height:" << img_height;
-  DLOG << "num_priors:" << num_priors;
-  DLOG << "C:" << C;
-  DLOG << "isclip:" << isclip;
-  for (int i = 0; i < num_priors; i++) {
-    DLOG << box_width[i];
-  }
-  for (int i = 0; i < num_priors; i++) {
-    DLOG << box_height[i];
-  }
+  framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(),
+                                          this->cl_helper_.CLCommandQueue());
+
+  std::vector<int64_t> variances_shape({4});
+  framework::DDim vddim = framework::make_ddim(variances_shape);
+
+  variances_cl_tensor.Resize(vddim);
+  cl_mem variances_Buffer =
+      variances_cl_tensor.mutable_with_data<float>(variancesptr);
+
+  //            DLOG << "c_block:" << c_block;
+  //            DLOG << "w:" << w;
+  //            DLOG << "nh:" << nh;
+  //            DLOG << "step_width:" << step_width;
+  //            DLOG << "step_height:" << step_height;
+  //            DLOG << "offset:" << offset;
+  //            DLOG << "img_width:" << img_width;
+  //            DLOG << "img_height:" << img_height;
+  //            DLOG << "num_priors:" << num_priors;
+  //            DLOG << "C:" << C;
+  //            DLOG << "isclip:" << isclip;
+  //            printf("param.MinMaxAspectRatiosOrder() =
+  //            %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i <
+  //            num_priors; i++) {
+  //                DLOG << box_width[i];
+  //                DLOG << box_height[i];
+  //            }
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
   CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 1, sizeof(int), &w);
@@ -162,30 +177,36 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
   CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 6, sizeof(float), &step_width);
+  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 7, sizeof(float), &step_height);
+  status = clSetKernelArg(kernel, 8, sizeof(float), &step_width);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 8, sizeof(float), &offset);
+  status = clSetKernelArg(kernel, 9, sizeof(float), &step_height);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 9, sizeof(int), &img_width);
+  status = clSetKernelArg(kernel, 10, sizeof(float), &offset);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 10, sizeof(int), &img_height);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &img_width);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &img_height);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 12, sizeof(int), &C);
+  status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors);
   CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 13, sizeof(int), &isclip);
+  status = clSetKernelArg(kernel, 14, sizeof(int), &C);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 15, sizeof(int), &isclip);
   CL_CHECK_ERRORS(status);
   size_t global_work_size[2] = {c_block, nh};
   status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                   NULL, global_work_size, NULL, 0, NULL, NULL);
   CL_CHECK_ERRORS(status);
+
   paddle_mobile::memory::Free(box_width);
   paddle_mobile::memory::Free(box_height);
+  paddle_mobile::memory::Free(variancesptr);
 }
 template class PriorBoxKernel<GPU_CL, float>;
 
diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp
index fb3aa9b52f..4e8d3e1d60 100644
--- a/src/operators/kernel/cl/reshape_kernel.cpp
+++ b/src/operators/kernel/cl/reshape_kernel.cpp
@@ -26,40 +26,76 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
 template <>
 void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
   auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
   const auto *input = param.InputX();
   auto *output = param.Out();
-  auto inputImage = input->GetCLImage();
-  auto outputImage = output->GetCLImage();
-  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
-  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+  auto input_image = input->GetCLImage();
+  auto output_image = output->GetCLImage();
   const auto &inputDim = input->dims();
   const auto &outputDim = output->dims();
-  int dims[4] = {1, 1, 1, 1};
-  int odims[4] = {1, 1, 1, 1};
+  int input_dims[4] = {1, 1, 1, 1};
+  int output_dims[4] = {1, 1, 1, 1};
   // 1 1000 1 1
   for (int i = 0; i < inputDim.size(); i++) {
-    dims[4 - inputDim.size() + i] = inputDim[i];
+    input_dims[4 - inputDim.size() + i] = inputDim[i];
   }
 
   // 1 1 1 1000
   for (int i = 0; i < outputDim.size(); i++) {
-    odims[4 - outputDim.size() + i] = outputDim[i];
+    output_dims[4 - outputDim.size() + i] = outputDim[i];
   }
-  clSetKernelArg(kernel, 2, sizeof(cl_int), &dims);
-  clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]);
-  clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]);
-  clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]);
-  clSetKernelArg(kernel, 6, sizeof(cl_int), &odims);
-  clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]);
-  clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]);
-  clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]);
-  const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
 
-  //  cl_event out_event = param.Out()->GetClEvent();
-  //  cl_event wait_event = param.InputX()->GetClEvent();
-
-  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
-                         work_size, NULL, 0, NULL, NULL);
+  int out_C = output_dims[1];
+  int out_H = output_dims[2];
+  int out_W = output_dims[3];
+  int in_W = input_dims[3];
+  int in_H = input_dims[2];
+  int in_Stride0 = in_W;
+  int in_Stride1 = input_dims[2] * input_dims[3];
+  int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3];
+  int out_Stride0 = out_W;
+  int out_Stride1 = out_H * out_W;
+  int out_Stride2 = out_C * out_H * out_W;
+  DLOG << "out_C=" << out_C;
+  DLOG << "out_H=" << out_H;
+  DLOG << "out_W=" << out_W;
+  DLOG << "in_W=" << in_W;
+  DLOG << "default_work_size=" << default_work_size;
+  DLOG << "in_Stride0=" << in_Stride0;
+  DLOG << "in_Stride1=" << in_Stride1;
+  DLOG << "out_Stride0=" << out_Stride0;
+  DLOG << "out_Stride1=" << out_Stride1;
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(int), &in_H);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2);
+  CL_CHECK_ERRORS(status);
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
 }
 
 template class ReshapeKernel<GPU_CL, float>;
diff --git a/src/operators/kernel/cl/transpose_kernel.cpp b/src/operators/kernel/cl/transpose_kernel.cpp
index 3b41753f18..b5be025a15 100644
--- a/src/operators/kernel/cl/transpose_kernel.cpp
+++ b/src/operators/kernel/cl/transpose_kernel.cpp
@@ -20,12 +20,48 @@ namespace operators {
 
 template <>
 bool TransposeKernel<GPU_CL, float>::Init(TransposeParam<GPU_CL> *param) {
+  if (param->Out()->dims().size() == 4) {
+    this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl");
+  }
   return true;
 }
 
 template <>
 void TransposeKernel<GPU_CL, float>::Compute(
-    const TransposeParam<GPU_CL> &param) {}
+    const TransposeParam<GPU_CL> &param) {
+  if (param.Out()->dims().size() == 4) {
+    auto kernel = this->cl_helper_.KernelAt(0);
+    auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
+    int out_C = param.Out()->dims()[1];
+    int out_H = param.Out()->dims()[2];
+    int out_W = param.Out()->dims()[3];
+    int in_W = param.InputX()->dims()[3];
+    auto output_image = param.Out()->GetCLImage();
+    auto input_image = param.InputX()->GetCLImage();
+    DLOG << "out_C=" << out_C;
+    DLOG << "out_H=" << out_H;
+    DLOG << "out_W=" << out_W;
+    DLOG << "in_C=" << in_W;
+    DLOG << "default_work_size=" << default_work_size;
+    cl_int status;
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
+    CL_CHECK_ERRORS(status);
+    status = clEnqueueNDRangeKernel(
+        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index cc87d3106f..4d4878789e 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -849,6 +849,8 @@ class PriorBoxParam : public OpParam {
     if (HasAttr("min_max_aspect_ratios_order", attrs)) {
       min_max_aspect_ratios_order_ =
           GetAttr<bool>("min_max_aspect_ratios_order", attrs);
+    } else {
+      min_max_aspect_ratios_order_ = false;
     }
     flip_ = GetAttr<bool>("flip", attrs);
     clip_ = GetAttr<bool>("clip", attrs);
-- 
GitLab