Commit 8b71275c authored by zhangyang

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -61,9 +61,16 @@ class CLHelper {
auto work_size_2 = n * h;
return {work_size_0, work_size_1, work_size_2};
} else if (image_dim.size() == 2) {
auto h = image_dim[0];
auto w = image_dim[1];
return {1, image.ImageWidth(), image.ImageHeight()};
} else if (image_dim.size() == 1) {
return {1, image.ImageWidth(), 1};
} else if (image_dim.size() == 3) {
int c = image_dim[0];
int h = image_dim[1];
int w = image_dim[2];
return {(c + 3) / 4, w, h};
}
PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
}
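For reference, a minimal C++ sketch of the work-size rule the branches above implement. The 4-D branch is only partially visible in this hunk, so its reconstruction here (c_block derived from the packed image width) is an assumption; helper names are illustrative, not the real CLHelper API:

#include <array>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Illustrative only: map an image dim to an OpenCL global work size.
std::array<size_t, 3> DefaultWorkSizeSketch(const std::vector<int64_t> &image_dim,
                                            size_t image_width, size_t image_height) {
  if (image_dim.size() == 4) {
    size_t n = image_dim[0], h = image_dim[2], w = image_dim[3];
    return {image_width / w, w, n * h};       // assumed: c_block, w, n*h
  } else if (image_dim.size() == 2) {
    return {1, image_width, image_height};    // whole image as one block
  } else if (image_dim.size() == 1) {
    return {1, image_width, 1};
  } else if (image_dim.size() == 3) {
    size_t c = image_dim[0], h = image_dim[1], w = image_dim[2];
    return {(c + 3) / 4, w, h};               // pack channels 4 per block
  }
  throw std::runtime_error("unsupported image dim rank");
}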
......
......@@ -120,17 +120,19 @@ class CLImage {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
// CLImageConverterFolder *folder_converter = new
// CLImageConverterFolder();
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
DLOG << " to get image dims ";
image_dims_ = folder_converter->InitImageDimInfoWith(dim);
image_dims_ = normal_converter->InitImageDimInfoWith(dim);
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
tensor_dims_ = dim;
command_queue_ = command_queue;
image_converter_ = folder_converter;
image_converter_ = normal_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
......
......@@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
}
}
const DDim &CLImageConverterNormal::InitImageDimInfoWith(
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return make_ddim({width, height});
}
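A standalone sketch of the layout rule InitImageDimInfoWith encodes: the tensor dim is right-aligned into NCHW, channels are packed four to a half4 pixel along the image width, and the batch is stacked along the height, so a {1, 24, 16, 16} tensor maps to a 96 x 16 image (c_block = 6). Illustrative code only, not the framework API:

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

std::array<size_t, 2> NormalImageDim(const std::vector<int64_t> &tensor_dim) {
  size_t d[4] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    d[4 - tensor_dim.size() + j] = static_cast<size_t>(tensor_dim[j]);
  }
  const size_t N = d[0], C = d[1], H = d[2], W = d[3];
  // width packs C in groups of 4, height stacks the batch
  return {W * ((C + 3) / 4), H * N};  // {1, 24, 16, 16} -> {96, 16}
}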
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
"tensor dim is not support ");
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
} // namespace framework
} // namespace paddle_mobile
......@@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase {
int height_of_one_block_;
};
class CLImageConverterNormal : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
......
......@@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
float4 weight_x = read_imagef(filter, sampler, pos_of_weight);
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
float4 weight_y = read_imagef(filter, sampler, pos_of_weight);
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
float4 weight_z = read_imagef(filter, sampler, pos_of_weight);
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
float4 weight_w = read_imagef(filter, sampler, pos_of_weight);
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
......
......@@ -19,47 +19,52 @@ __kernel void prior_box(__private const int global_size_dim0,
__private const int global_size_dim2,
__global float *box_width,
__global float *box_height,
__write_only image2d_t output_image,
__global float *variances_Buffer,
__write_only image2d_t output_boxes,
__write_only image2d_t output_variances,
__private const float step_width,
__private const float step_height,
__private const float offset,
__private const int img_width,
__private const int img_height,
__private const int num_priors,
__private const int C){
__private const int C,
__private const int clip){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_n = out_nh/num_priors;
const int out_h = out_nh%num_priors;
if (out_c >= global_size_dim0 ||out_nh >= global_size_dim2) {
return;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * 4;
output_pos.y = out_nh;
float center_x0 = (offset + out_c * 4) * step_width;
float center_x1 = (offset + out_c * 4 + 1) * step_width;
float center_x2 = (offset + out_c * 4 + 2) * step_width;
float center_x3 = (offset + out_c * 4 + 3) * step_width;
float center_y = (out_n + offset) * step_height;
float center_x0 = (offset + (float)(out_c * 4)) * step_width;
float center_x1 = (offset + (float)(out_c * 4 + 1)) * step_width;
float center_x2 = (offset + (float)(out_c * 4 + 2)) * step_width;
float center_x3 = (offset + (float)(out_c * 4 + 3)) * step_width;
float center_y = ((float)out_n + offset) * step_height;
half4 output[4];
output[0].x = convert_half((center_x0 - box_width[out_h]) / img_width);
output[1].x = convert_half((center_y - box_height[out_h]) / img_height);
output[2].x = convert_half((center_x0 + box_width[out_h]) / img_width);
output[3].x = convert_half((center_y + box_height[out_h]) / img_height);
half4 variances[4];
output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width);
output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width);
output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].x = convert_half(variances_Buffer[0]);
variances[1].x = convert_half(variances_Buffer[1]);
variances[2].x = convert_half(variances_Buffer[2]);
variances[3].x = convert_half(variances_Buffer[3]);
if(C - 4 * out_c>=2){
output[0].y = convert_half((center_x1 - box_width[out_h]) / img_width);
output[1].y = convert_half((center_y - box_height[out_h]) / img_height);
output[2].y = convert_half((center_x1 + box_width[out_h]) / img_width);
output[3].y = convert_half((center_y + box_height[out_h]) / img_height);
output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width);
output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].y = convert_half((center_x1 + box_width[out_h]) / (float)img_width);
output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].y = convert_half(variances_Buffer[0]);
variances[1].y = convert_half(variances_Buffer[1]);
variances[2].y = convert_half(variances_Buffer[2]);
variances[3].y = convert_half(variances_Buffer[3]);
}else{
output[0].y = 0.0f;
output[1].y = 0.0f;
......@@ -67,10 +72,14 @@ __kernel void prior_box(__private const int global_size_dim0,
output[3].y = 0.0f;
}
if(C - 4 * out_c>=3){
output[0].z = convert_half((center_x2 - box_width[out_h]) / img_width);
output[1].z = convert_half((center_y - box_height[out_h]) / img_height);
output[2].z = convert_half((center_x2 + box_width[out_h]) / img_width);
output[3].z = convert_half((center_y + box_height[out_h]) / img_height);
output[0].z = convert_half((center_x2 - box_width[out_h]) / (float)img_width);
output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width);
output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].z = convert_half(variances_Buffer[0]);
variances[1].z = convert_half(variances_Buffer[1]);
variances[2].z = convert_half(variances_Buffer[2]);
variances[3].z = convert_half(variances_Buffer[3]);
}else{
output[0].z = 0.0f;
output[1].z = 0.0f;
......@@ -78,23 +87,41 @@ __kernel void prior_box(__private const int global_size_dim0,
output[3].z = 0.0f;
}
if(C - 4 * out_c>=4){
output[0].w = convert_half((center_x3 - box_width[out_h]) / img_width);
output[1].w = convert_half((center_y - box_height[out_h]) / img_height);
output[2].w = convert_half((center_x3 + box_width[out_h]) / img_width);
output[3].w = convert_half((center_y + box_height[out_h]) / img_height);
output[0].w = convert_half((center_x3 - box_width[out_h]) / (float)img_width);
output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width);
output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].w = convert_half(variances_Buffer[0]);
variances[1].w = convert_half(variances_Buffer[1]);
variances[2].w = convert_half(variances_Buffer[2]);
variances[3].w = convert_half(variances_Buffer[3]);
}else{
output[0].z = 0.0f;
output[1].z = 0.0f;
output[2].z = 0.0f;
output[3].z = 0.0f;
output[0].w = 0.0f;
output[1].w = 0.0f;
output[2].w = 0.0f;
output[3].w = 0.0f;
}
if(clip==1){
output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
}
output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[0]);
write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[1]);
write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[2]);
write_imageh(output_image, (int2)(output_pos.x + 4, output_pos.y), output[3]);
if(output_pos.x == 0 && output_pos.y == 1){
float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x);
printf("output = %v4hlf \n", out);
}
write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]);
write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]);
write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]);
write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]);
write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]);
write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]);
write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]);
write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]);
}
\ No newline at end of file
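The kernel above now also takes a variances buffer, a separate output_variances image, and a clip flag, and clamping to [0, 1] happens only when clip == 1. A CPU sketch of what one lane of one work item computes for a single prior (centers come from (offset + column) * step; illustrative code, not the framework API):

#include <algorithm>
#include <array>

std::array<float, 4> PriorBoxRef(float center_x, float center_y,
                                 float box_w, float box_h,
                                 int img_w, int img_h, bool clip) {
  std::array<float, 4> box = {
      (center_x - box_w) / img_w, (center_y - box_h) / img_h,   // xmin, ymin
      (center_x + box_w) / img_w, (center_y + box_h) / img_h};  // xmax, ymax
  if (clip) {
    for (float &v : box) v = std::min(std::max(v, 0.0f), 1.0f);
  }
  return box;
}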
......@@ -14,26 +14,150 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void reshape(__read_only image2d_t input,
__write_only image2d_t output,
__private const int d0,
__private const int d1,
__private const int d2,
__private const int d3,
__private const int x0,
__private const int x1,
__private const int x2,
__private const int x3) {
const int x = get_global_id(0);
const int y = get_global_id(1);
__kernel void reshape(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W,
__private const int in_H,
__private const int in_Stride0,
__private const int in_Stride1,
__private const int in_Stride2,
__private const int out_Stride0,
__private const int out_Stride1,
__private const int out_Stride2) {
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh/out_H;
const int out_h = out_nh%out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4+ 2;
const int out_c3 = out_c * 4+ 3;
int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
int in_n0 = count0/in_Stride2;
int in_n1 = count1/in_Stride2;
int in_n2 = count2/in_Stride2;
int in_n3 = count3/in_Stride2;
count0 = count0%in_Stride2;
count1 = count1%in_Stride2;
count2 = count2%in_Stride2;
count3 = count3%in_Stride2;
int in_c0 = count0/in_Stride1;
int in_c1 = count1/in_Stride1;
int in_c2 = count2/in_Stride1;
int in_c3 = count3/in_Stride1;
int in_h0 = (count0%in_Stride1)/in_Stride0;
int in_h1 = (count1%in_Stride1)/in_Stride0;
int in_h2 = (count2%in_Stride1)/in_Stride0;
int in_h3 = (count3%in_Stride1)/in_Stride0;
int in_w0 = (count0%in_Stride1)%in_Stride0;
int in_w1 = (count1%in_Stride1)%in_Stride0;
int in_w2 = (count2%in_Stride1)%in_Stride0;
int in_w3 = (count3%in_Stride1)%in_Stride0;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = (in_c0/4) * in_W + in_w0;
input_pos0.y = in_n0 * in_H + in_h0;
input_pos1.x = (in_c1/4) * in_W + in_w1;
input_pos1.y = in_n1 * in_H + in_h1;
input_pos2.x = (in_c2/4) * in_W + in_w2;
input_pos2.y = in_n2 * in_H + in_h2;
input_pos3.x = (in_c3/4) * in_W + in_w3;
input_pos3.y = in_n3 * in_H + in_h3;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input0;
half4 input1;
half4 input2;
half4 input3;
half4 output;
input0 = read_imageh(input_image, sampler,input_pos0);
if(in_c0%4==0){
output.x = input0.x;
}else if(in_c0%4==1){
output.x = input0.y;
}else if(in_c0%4==2){
output.x = input0.z;
}else{
output.x = input0.w;
}
if(out_C - out_c * 4>=2){
input1 = read_imageh(input_image, sampler,input_pos1);
if(in_c1%4==0){
output.y = input1.x;
}else if(in_c1%4==1){
output.y = input1.y;
}else if(in_c1%4==2){
output.y = input1.z;
}else{
output.y = input1.w;
}
}else{
output.y = 0.0f;
}
if(out_C - out_c * 4>=3){
input2 = read_imageh(input_image, sampler,input_pos2);
if(in_c2%4==0){
output.z = input2.x;
}else if(in_c2%4==1){
output.z = input2.y;
}else if(in_c2%4==2){
output.z = input2.z;
}else{
output.z = input2.w;
}
}else{
output.z = 0.0f;
}
half4 in = read_imageh(input, sampler, (int2)(x, y));
if(out_C - out_c * 4>=4){
input3 = read_imageh(input_image, sampler,input_pos3);
if(in_c3%4==0){
output.w = input3.x;
}else if(in_c3%4==1){
output.w = input3.y;
}else if(in_c3%4==2){
output.w = input3.z;
}else{
output.w = input3.w;
}
}else{
output.w = 0.0f;
}
write_imageh(output, (int2)(x, y), in);
write_imageh(output_image, output_pos, output);
}
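The reshape kernel above keeps the linear NCHW offset fixed and only changes the shape: each output coordinate is flattened with the output strides, then unflattened with the input strides to locate the source pixel. A CPU sketch of that index math (strides follow the host code: stride0 = W, stride1 = H*W, stride2 = C*H*W; illustrative only):

#include <array>

std::array<int, 4> ReshapeSourceCoord(int out_n, int out_c, int out_h, int out_w,
                                      int out_s0, int out_s1, int out_s2,
                                      int in_s0, int in_s1, int in_s2) {
  int offset = out_n * out_s2 + out_c * out_s1 + out_h * out_s0 + out_w;
  int in_n = offset / in_s2;
  offset %= in_s2;
  int in_c = offset / in_s1;
  int in_h = (offset % in_s1) / in_s0;
  int in_w = (offset % in_s1) % in_s0;
  return {in_n, in_c, in_h, in_w};  // source NCHW coordinate
}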
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void transpose_4d( __read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W
){
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh%out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4+ 2;
const int out_c3 = out_c * 4+ 3;
const int in_n = out_n;
const int in_c = out_w / 4;
const int in_h0 = out_c0;
const int in_h1 = out_c1;
const int in_h2 = out_c2;
const int in_h3 = out_c3;
const int in_w = out_h;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_h0;
input_pos1.x = in_W * in_c + in_w;
input_pos1.y = in_n * in_h1;
input_pos2.x = in_W * in_c + in_w;
input_pos2.y = in_n * in_h2;
input_pos3.x = in_W * in_c + in_w;
input_pos3.y = in_n * in_h3;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input0;
half4 input1;
half4 input2;
half4 input3;
half4 output;
input0 = read_imageh(input_image, sampler,input_pos0);
if(out_w%4==0){
output.x = input0.x;
}else if(out_w%4==1){
output.x = input0.y;
}else if(out_w%4==2){
output.x = input0.z;
}else{
output.x = input0.w;
}
if(out_C - out_c * 4>=2){
input1 = read_imageh(input_image, sampler,input_pos1);
if(out_w%4==0){
output.y = input1.x;
}else if(out_w%4==1){
output.y = input1.y;
}else if(out_w%4==2){
output.y = input1.z;
}else{
output.y = input1.w;
}
}else{
output.y = 0.0f;
}
if(out_C - out_c * 4>=3){
input2 = read_imageh(input_image, sampler,input_pos2);
if(out_w%4==0){
output.z = input2.x;
}else if(out_w%4==1){
output.z = input2.y;
}else if(out_w%4==2){
output.z = input2.z;
}else{
output.z = input2.w;
}
}else{
output.z = 0.0f;
}
if(out_C - out_c * 4>=4){
input3 = read_imageh(input_image, sampler,input_pos3);
if(out_w%4==0){
output.w = input3.x;
}else if(out_w%4==1){
output.w = input3.y;
}else if(out_w%4==2){
output.w = input3.z;
}else{
output.w = input3.w;
}
}else{
output.w = 0.0f;
}
write_imageh(output_image, output_pos, output);
}
\ No newline at end of file
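The coordinate mapping the transpose_4d kernel encodes is output(n, c, h, w) = input(n, w, c, h) with the batch hard-coded to 1, which appears to correspond to a perm {0, 2, 3, 1} transpose. A CPU sketch over plain NCHW buffers (illustrative only; the kernel itself works on the packed half4 image layout):

#include <vector>

void Transpose4DRef(const std::vector<float> &in, std::vector<float> *out,
                    int out_C, int out_H, int out_W) {
  // input logical shape here is (1, out_W, out_C, out_H)
  out->assign(static_cast<size_t>(out_C) * out_H * out_W, 0.0f);
  for (int c = 0; c < out_C; ++c)
    for (int h = 0; h < out_H; ++h)
      for (int w = 0; w < out_W; ++w) {
        int in_idx = (w * out_C + c) * out_H + h;   // input (0, w, c, h)
        int out_idx = (c * out_H + h) * out_W + w;  // output (0, c, h, w)
        (*out)[out_idx] = in[in_idx];
      }
}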
......@@ -22,11 +22,11 @@ namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
if (param->InputX()->dims().size() <= 2) {
this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
} else {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
}
// if (param->InputX()->dims().size() <= 2) {
// this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
// } else {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
// }
return true;
}
......@@ -49,11 +49,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
C = new_dims[1];
in_height = new_dims[2];
if (dim.size() <= 2) {
in_width = param.InputX()->ImageWidth();
} else {
in_width = new_dims[3];
}
// if (dim.size() <= 2) {
// in_width = param.InputX()->ImageWidth();
// } else {
in_width = new_dims[3];
// }
CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
......@@ -64,16 +64,16 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
if (dim.size() > 2) {
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
int out_c = new_dims[1];
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &out_c);
}
// if (dim.size() > 2) {
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
int out_c = new_dims[1];
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &out_c);
// }
// cl_event wait_event = param.InpdutX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
......@@ -93,8 +93,6 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
// << "ms" << std::endl;
memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
DLOG << *param.InputX();
DLOG << *out;
}
template class FetchKernel<GPU_CL, float>;
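With the 2-D special case commented out, the fetch kernel is always given the general buffer-size arguments: size_ch = H*W elements per channel plane, size_block = 4*H*W per packed half4 block, and size_batch = C*H*W per batch item; for a 1 x 1000 x 1 x 1 output that is {1, 4, 1000}. A small sketch of that computation (illustrative only):

struct FetchSizes {
  int size_ch;     // elements per channel plane: H * W
  int size_block;  // elements per packed half4 block: 4 * H * W
  int size_batch;  // elements per batch item: C * H * W
};

FetchSizes ComputeFetchSizes(int C, int H, int W) {
  int size_ch = H * W;
  return {size_ch, size_ch * 4, size_ch * C};  // e.g. {1, 4, 1000}
}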
......
......@@ -39,6 +39,10 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
const auto &input_aspect_ratio = param.AspectRatios();
const bool &flip = param.Flip();
const bool &clip = param.Clip();
int isclip = 0;
if (clip) {
isclip = 1;
}
const float &step_w = param.StepW();
const float &step_h = param.StepH();
const float &offset = param.Offset();
......@@ -75,6 +79,8 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
float *box_height = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
float *variancesptr =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * 4));
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
......@@ -108,6 +114,9 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
}
}
}
for (int i = 0; i < variances.size(); i++) {
variancesptr[i] = variances[i];
}
cl_int status;
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size =
......@@ -116,7 +125,7 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
int w = default_work_size[1];
int nh = default_work_size[2];
std::vector<int64_t> box_shape({1, 1, 1, num_priors});
std::vector<int64_t> box_shape({num_priors});
framework::DDim ddim = framework::make_ddim(box_shape);
framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(),
......@@ -131,16 +140,33 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
cl_mem box_height_Buffer =
box_height_cl_tensor.mutable_with_data<float>(box_height);
DLOG << "c_block:" << c_block;
DLOG << "w:" << w;
DLOG << "nh:" << nh;
DLOG << "step_width:" << step_width;
DLOG << "step_height:" << step_height;
DLOG << "offset:" << offset;
DLOG << "img_width:" << img_width;
DLOG << "img_height:" << img_height;
DLOG << "num_priors:" << num_priors;
DLOG << "C:" << C;
framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
std::vector<int64_t> variances_shape({4});
framework::DDim vddim = framework::make_ddim(variances_shape);
variances_cl_tensor.Resize(vddim);
cl_mem variances_Buffer =
variances_cl_tensor.mutable_with_data<float>(variancesptr);
// DLOG << "c_block:" << c_block;
// DLOG << "w:" << w;
// DLOG << "nh:" << nh;
// DLOG << "step_width:" << step_width;
// DLOG << "step_height:" << step_height;
// DLOG << "offset:" << offset;
// DLOG << "img_width:" << img_width;
// DLOG << "img_height:" << img_height;
// DLOG << "num_priors:" << num_priors;
// DLOG << "C:" << C;
// DLOG << "isclip:" << isclip;
// printf("param.MinMaxAspectRatiosOrder() =
// %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i <
// num_priors; i++) {
// DLOG << box_width[i];
// DLOG << box_height[i];
// }
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
......@@ -151,28 +177,36 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(float), &step_width);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(float), &step_height);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(float), &offset);
status = clSetKernelArg(kernel, 8, sizeof(float), &step_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &img_width);
status = clSetKernelArg(kernel, 9, sizeof(float), &step_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &img_height);
status = clSetKernelArg(kernel, 10, sizeof(float), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors);
status = clSetKernelArg(kernel, 11, sizeof(int), &img_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &C);
status = clSetKernelArg(kernel, 12, sizeof(int), &img_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &isclip);
CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {c_block, nh};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
paddle_mobile::memory::Free(box_width);
paddle_mobile::memory::Free(box_height);
paddle_mobile::memory::Free(variancesptr);
}
template class PriorBoxKernel<GPU_CL, float>;
......
......@@ -26,40 +26,76 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
template <>
void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
const auto *input = param.InputX();
auto *output = param.Out();
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
auto input_image = input->GetCLImage();
auto output_image = output->GetCLImage();
const auto &inputDim = input->dims();
const auto &outputDim = output->dims();
int dims[4] = {1, 1, 1, 1};
int odims[4] = {1, 1, 1, 1};
int input_dims[4] = {1, 1, 1, 1};
int output_dims[4] = {1, 1, 1, 1};
// 1 1000 1 1
for (int i = 0; i < inputDim.size(); i++) {
dims[4 - inputDim.size() + i] = inputDim[i];
input_dims[4 - inputDim.size() + i] = inputDim[i];
}
// 1 1 1 1000
for (int i = 0; i < outputDim.size(); i++) {
odims[4 - outputDim.size() + i] = outputDim[i];
output_dims[4 - outputDim.size() + i] = outputDim[i];
}
clSetKernelArg(kernel, 2, sizeof(cl_int), &dims);
clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]);
clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]);
clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]);
clSetKernelArg(kernel, 6, sizeof(cl_int), &odims);
clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]);
clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]);
clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]);
const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
int out_C = output_dims[1];
int out_H = output_dims[2];
int out_W = output_dims[3];
int in_W = input_dims[3];
int in_H = input_dims[2];
int in_Stride0 = in_W;
int in_Stride1 = input_dims[2] * input_dims[3];
int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3];
int out_Stride0 = out_W;
int out_Stride1 = out_H * out_W;
int out_Stride2 = out_C * out_H * out_W;
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "out_W=" << out_W;
DLOG << "in_W=" << in_W;
DLOG << "default_work_size=" << default_work_size;
DLOG << "in_Stride0=" << in_Stride0;
DLOG << "in_Stride1=" << in_Stride1;
DLOG << "out_Stride0=" << out_Stride0;
DLOG << "out_Stride1=" << out_Stride1;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(int), &in_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ReshapeKernel<GPU_CL, float>;
......
......@@ -20,12 +20,48 @@ namespace operators {
template <>
bool TransposeKernel<GPU_CL, float>::Init(TransposeParam<GPU_CL> *param) {
if (param->Out()->dims().size() == 4) {
this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl");
}
return true;
}
template <>
void TransposeKernel<GPU_CL, float>::Compute(
const TransposeParam<GPU_CL> &param) {}
const TransposeParam<GPU_CL> &param) {
if (param.Out()->dims().size() == 4) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
int out_C = param.Out()->dims()[1];
int out_H = param.Out()->dims()[2];
int out_W = param.Out()->dims()[3];
int in_W = param.InputX()->dims()[3];
auto output_image = param.Out()->GetCLImage();
auto input_image = param.InputX()->GetCLImage();
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "out_W=" << out_W;
DLOG << "in_C=" << in_W;
DLOG << "default_work_size=" << default_work_size;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -849,6 +849,8 @@ class PriorBoxParam : public OpParam {
if (HasAttr("min_max_aspect_ratios_order", attrs)) {
min_max_aspect_ratios_order_ =
GetAttr<bool>("min_max_aspect_ratios_order", attrs);
} else {
min_max_aspect_ratios_order_ = false;
}
flip_ = GetAttr<bool>("flip", attrs);
clip_ = GetAttr<bool>("clip", attrs);
......
......@@ -366,5 +366,8 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
target_link_libraries(test-eng paddle-mobile)
# gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
target_link_libraries(test-super paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif ()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../../src/common/types.h"
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
auto time1 = paddle_mobile::time();
#ifdef PADDLE_MOBILE_CL
paddle_mobile.SetCLPath("/data/local/tmp/bin");
#endif
auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
std::string(g_super) + "/params", true, false,
1, true);
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true);
if (isok) {
auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
<< std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 1, 300, 300};
GetInput<float>(g_yolo_img, &input, dims);
std::vector<float> vec_result;
auto time3 = paddle_mobile::time();
int max = 10;
for (int i = 0; i < max; ++i) {
vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = paddle_mobile::time();
std::cout << "predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl;
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
......@@ -36,16 +36,19 @@ static const char *g_squeezenet = "../models/squeezenet";
static const char *g_googlenet = "../models/googlenet";
static const char *g_googlenet_quali = "../models/googlenet_combine_quali";
static const char *g_mobilenet = "../models/mobilenet";
static const char *g_mobilenet_mul = "../models/mobilenet_mul";
static const char *g_mobilenet_mul = "../models/r";
static const char *g_alexnet = "../models/alexnet";
static const char *g_inceptionv4 = "../models/inceptionv4";
static const char *g_inceptionv3 =
"../models/InceptionV3_Spatial_Attention_Model";
static const char *g_nlp = "../models/nlp";
static const char *g_super = "../models/superresoltion";
static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo";
static const char *g_yolo_combined = "../models/yolo_combined";
static const char *g_yolo_mul = "../models/yolo_mul";
static const char *g_yolo_mul = "../models/d";
static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
......