diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
index 5dc264f7e5b0e276c37566393acc355d83c4fed7..6fe5596a4cf5cbce5b50c9a3d53be164aad8a0b5 100644
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -1,28 +1,28 @@
 #include <cl_common.h>
-__kernel void conv2d_1x1(
-    __private const int global_size_dim0,
-    __private const int global_size_dim1,
-    __private const int global_size_dim2,
-    __read_only image2d_t input_image,
-    __read_only image2d_t filter,
+__kernel void conv2d_1x1(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-    __read_only image2d_t bias,
+                         __read_only image2d_t bias,
 #endif
-    #ifdef BATCH_NORM
-    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
-    #endif
-    __write_only image2d_t output_image,
-    __private const int stride,
-    __private const int offset,
-    __private const int input_c,
-    __private const int input_c_origin,
-    __private const int dilation,
-    __private const int input_width,  /* of one block */
-    __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w) {
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int input_c_origin,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height,
+                         __private const int old_w) {
   CL_DTYPE zero = 0.0f;
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
@@ -61,12 +61,13 @@ __kernel void conv2d_1x1(
       ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);

 #ifdef BIASE_CH
-  CL_DTYPE4 output0 = read_imagef(bias, sampler, (int2)(out_c, 0));
+  CL_DTYPE4 output0 =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
   CL_DTYPE4 output1 = output0;
   CL_DTYPE4 output2 = output0;
   CL_DTYPE4 output3 = output0;
 #elif defined(BIASE_ELE)
-  CL_DTYPE4 output0 = read_imagef(bias, sampler, output_pos0);
+  CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
   CL_DTYPE4 output1 = output0;
   CL_DTYPE4 output2 = output0;
   CL_DTYPE4 output3 = output0;
@@ -89,12 +90,17 @@ __kernel void conv2d_1x1(
     // ------------0---------------
     int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                          in_pos_in_one_block0.y);
-    CL_DTYPE4 input0 = read_imagef(input_image, sampler, pos_in);
-
-    CL_DTYPE4 weight0 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 0));
-    CL_DTYPE4 weight1 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 1));
-    CL_DTYPE4 weight2 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 2));
-    CL_DTYPE4 weight3 = read_imagef(filter, sampler, (int2)(out_c, i * 4 + 3));
+    CL_DTYPE4 input0 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
+
+    CL_DTYPE4 weight0 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
+    CL_DTYPE4 weight1 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
+    CL_DTYPE4 weight2 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
+    CL_DTYPE4 weight3 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));

     int bound_gap = max_w_bound - pos_in.x - 1;
     bool outof_bound = bound_gap < input_width && bound_gap >= 0;
@@ -109,7 +115,8 @@ __kernel void conv2d_1x1(
     // -------------1--------------
     pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                     in_pos_in_one_block1.y);
-    CL_DTYPE4 input1 = read_imagef(input_image, sampler, pos_in);
+    CL_DTYPE4 input1 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

     bound_gap = max_w_bound - pos_in.x - 1;

@@ -126,7 +133,8 @@ __kernel void conv2d_1x1(
     // -------------2--------------
     pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
                     in_pos_in_one_block2.y);
-    CL_DTYPE4 input2 = read_imagef(input_image, sampler, pos_in);
+    CL_DTYPE4 input2 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

     bound_gap = max_w_bound - pos_in.x - 1;

@@ -143,7 +151,8 @@ __kernel void conv2d_1x1(
     // -------------3--------------
     pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
                     in_pos_in_one_block3.y);
-    CL_DTYPE4 input3 = read_imagef(input_image, sampler, pos_in);
+    CL_DTYPE4 input3 =
+        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

     bound_gap = max_w_bound - pos_in.x - 1;
     outof_bound = bound_gap < input_width && bound_gap >= 0;
@@ -165,17 +174,21 @@ __kernel void conv2d_1x1(
   }

 #ifdef BATCH_NORM
-  output0 = output0 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));

-  output1 = output1 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));

-  output2 = output2 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));

-  output3 = output3 * read_imagef(new_scale, sampler, (int2)(out_c, 0)) +
-            read_imagef(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -186,18 +199,18 @@
 #endif

   if (out_w0 < old_w) {
-    write_imagef(output_image, output_pos0, output0);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
   }

   if (out_w1 < old_w) {
-    write_imagef(output_image, output_pos1, output1);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
   }

   if (out_w2 < old_w) {
-    write_imagef(output_image, output_pos2, output2);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
   }

   if (out_w3 < old_w) {
-    write_imagef(output_image, output_pos3, output3);
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
   }
 }
diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..314be875d29d2125f9573d33010ee9d33317ea71
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl
@@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void reshape(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int out_C,
+                      __private const int out_H,
+                      __private const int out_W,
+                      __private const int in_W,
+                      __private const int in_H,
+                      __private const int in_Stride0,
+                      __private const int in_Stride1,
+                      __private const int in_Stride2,
+                      __private const int out_Stride0,
+                      __private const int out_Stride1,
+                      __private const int out_Stride2) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const int out_n = out_nh / out_H;
+  const int out_h = out_nh % out_H;
+  const int out_c0 = out_c * 4;
+  const int out_c1 = out_c * 4 + 1;
+  const int out_c2 = out_c * 4 + 2;
+  const int out_c3 = out_c * 4 + 3;
+
+  int count0 =
+      out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count1 =
+      out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count2 =
+      out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count3 =
+      out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
+
+  int in_n0 = count0 / in_Stride2;
+  int in_n1 = count1 / in_Stride2;
+  int in_n2 = count2 / in_Stride2;
+  int in_n3 = count3 / in_Stride2;
+
+  count0 = count0 % in_Stride2;
+  count1 = count1 % in_Stride2;
+  count2 = count2 % in_Stride2;
+  count3 = count3 % in_Stride2;
+
+  int in_c0 = count0 / in_Stride1;
+  int in_c1 = count1 / in_Stride1;
+  int in_c2 = count2 / in_Stride1;
+  int in_c3 = count3 / in_Stride1;
+
+  int in_h0 = (count0 % in_Stride1) / in_Stride0;
+  int in_h1 = (count1 % in_Stride1) / in_Stride0;
+  int in_h2 = (count2 % in_Stride1) / in_Stride0;
+  int in_h3 = (count3 % in_Stride1) / in_Stride0;
+
+  int in_w0 = (count0 % in_Stride1) % in_Stride0;
+  int in_w1 = (count1 % in_Stride1) % in_Stride0;
+  int in_w2 = (count2 % in_Stride1) % in_Stride0;
+  int in_w3 = (count3 % in_Stride1) % in_Stride0;
+
+  int2 input_pos0;
+  int2 input_pos1;
+  int2 input_pos2;
+  int2 input_pos3;
+
+  input_pos0.x = (in_c0 / 4) * in_W + in_w0;
+  input_pos0.y = in_n0 * in_H + in_h0;
+
+  input_pos1.x = (in_c1 / 4) * in_W + in_w1;
+  input_pos1.y = in_n1 * in_H + in_h1;
+
+  input_pos2.x = (in_c2 / 4) * in_W + in_w2;
+  input_pos2.y = in_n2 * in_H + in_h2;
+
+  input_pos3.x = (in_c3 / 4) * in_W + in_w3;
+  input_pos3.y = in_n3 * in_H + in_h3;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 input0;
+  CL_DTYPE4 input1;
+  CL_DTYPE4 input2;
+  CL_DTYPE4 input3;
+  CL_DTYPE4 output;
+
+  input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0);
+  if (in_c0 % 4 == 0) {
+    output.x = input0.x;
+  } else if (in_c0 % 4 == 1) {
+    output.x = input0.y;
+  } else if (in_c0 % 4 == 2) {
+    output.x = input0.z;
+  } else {
+    output.x = input0.w;
+  }
+
+  if (out_C - out_c * 4 >= 2) {
+    input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1);
+    if (in_c1 % 4 == 0) {
+      output.y = input1.x;
+    } else if (in_c1 % 4 == 1) {
+      output.y = input1.y;
+    } else if (in_c1 % 4 == 2) {
+      output.y = input1.z;
+    } else {
+      output.y = input1.w;
+    }
+
+  } else {
+    output.y = 0.0f;
+  }
+
+  if (out_C - out_c * 4 >= 3) {
+    input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2);
+
+    if (in_c2 % 4 == 0) {
+      output.z = input2.x;
+    } else if (in_c2 % 4 == 1) {
+      output.z = input2.y;
+    } else if (in_c2 % 4 == 2) {
+      output.z = input2.z;
+    } else {
+      output.z = input2.w;
+    }
+  } else {
+    output.z = 0.0f;
+  }
+
+  if (out_C - out_c * 4 >= 4) {
+    input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3);
+    if (in_c3 % 4 == 0) {
+      output.w = input3.x;
+    } else if (in_c3 % 4 == 1) {
+      output.w = input3.y;
+    } else if (in_c3 % 4 == 2) {
+      output.w = input3.z;
+    } else {
+      output.w = input3.w;
+    }
+  } else {
+    output.w = 0.0f;
+  }
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index 13f527a2200198569054f2314e190b958973004e..3423b1e920e5e7c4aaa34125303b09d943e47b62 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -15,6 +15,7 @@ add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${te
 add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})

@@ -51,7 +52,9 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
 lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc
   DEPS conv2d_1x1_opencl cl_image_converter op_registry program context
   ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
-
+lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
+  DEPS reshape_opencl cl_image_converter op_registry program context
+  ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
   DEPS conv_opencl op_registry program context
   ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
diff --git a/lite/kernels/opencl/reshape_compute.cc b/lite/kernels/opencl/reshape_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7af648c5601e0a516eb92b3090cb8d7e836a5447
--- /dev/null
+++ b/lite/kernels/opencl/reshape_compute.cc
@@ -0,0 +1,206 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/logging.h"
+#include "lite/utils/replace_stl/stream.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+// reshape operator
+class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
+                                                   PRECISION(kFloat),
+                                                   DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::ReshapeParam;
+
+  void PrepareForRun() override {
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/reshape_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    VLOG(4) << "reshape_compute run ... ";
+
+    auto& param = *param_.get_mutable<param_t>();
+    const Tensor* const x = param.x;
+
+    const auto x_dims = x->dims();
+    const std::map<std::string, size_t>& input_image_shape =
+        InitImageDimInfoWith(x_dims);
+
+    const int64_t& input_image_width = input_image_shape.at("width");
+    const int64_t& input_image_height = input_image_shape.at("height");
+
+    const cl::Image2D* const x_image = x->data<float, cl::Image2D>();
+
+    const std::vector<int>& shape_vct = param.shape_vct;
+    Tensor* const output = param.output;
+    const DDimLite& out_dims = output->dims();
+    VLOG(4) << "out_dims= " << out_dims;
+
+    const std::map<std::string, size_t>& out_image_shape =
+        InitImageDimInfoWith(out_dims);
+    cl::Image2D* const out_image = output->mutable_data<float, cl::Image2D>(
+        out_image_shape.at("width"), out_image_shape.at("height"));
+    LOG(INFO) << "out_dims= " << out_dims;
+
+    const std::vector<size_t>& default_work_size = DefaultWorkSize(
+        out_dims,
+        DDim(std::vector<DDim::value_type>{
+            static_cast<DDim::value_type>(out_image_shape.at("width")),
+            static_cast<DDim::value_type>(out_image_shape.at("height"))}));
+
+    int x_v_dims[4] = {1, 1, 1, 1};
+    int out_v_dims[4] = {1, 1, 1, 1};
+    // 1 1000 1 1
+    for (int i = 0; i < x_dims.size(); i++) {
+      x_v_dims[4 - x_dims.size() + i] = x_dims[i];
+    }
+    // 1 1 1 1000
+    for (int i = 0; i < out_dims.size(); i++) {
+      out_v_dims[4 - out_dims.size() + i] = out_dims[i];
+    }
+
+    int out_C = out_v_dims[1];
+    int out_H = out_v_dims[2];
+    int out_W = out_v_dims[3];
+    int in_W = x_v_dims[3];
+    int in_H = x_v_dims[2];
+    int in_Stride0 = in_W;
+    int in_Stride1 = x_v_dims[2] * x_v_dims[3];
+    int in_Stride2 = x_v_dims[1] * x_v_dims[2] * x_v_dims[3];
+    int out_Stride0 = out_W;
+    int out_Stride1 = out_H * out_W;
+    int out_Stride2 = out_C * out_H * out_W;
+    VLOG(4) << "out_C=" << out_C;
+    VLOG(4) << "out_H=" << out_H;
+    VLOG(4) << "out_W=" << out_W;
+    VLOG(4) << "in_W=" << in_W;
+    VLOG(4) << "default_work_size= " << default_work_size[0] << ", "
+            << default_work_size[1] << ", " << default_work_size[2];
+    VLOG(4) << "in_Stride0=" << in_Stride0;
+    VLOG(4) << "in_Stride1=" << in_Stride1;
+    VLOG(4) << "out_Stride0=" << out_Stride0;
+    VLOG(4) << "out_Stride1=" << out_Stride1;
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    VLOG(4) << TargetToStr(x->target());
+    VLOG(4) << TargetToStr(param.output->target());
+
+    int arg_idx = 0;
+
+    cl_int status;
+    status = kernel.setArg(arg_idx, *x_image);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_image);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_C);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_H);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_W);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_W);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_H);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_Stride0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_Stride1);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, in_Stride2);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_Stride0);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_Stride1);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, out_Stride2);
+    CL_CHECK_FATAL(status);
+
+    auto global_work_size =
+        cl::NDRange{static_cast<cl::size_type>(default_work_size.data()[0]),
+                    static_cast<cl::size_type>(default_work_size.data()[1]),
+                    static_cast<cl::size_type>(default_work_size.data()[2])};
+
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_wait_list()->emplace(out_image, event_);
+  }
+
+ private:
+  std::string kernel_func_name_{"reshape"};
+  std::string build_options_{"-DCL_DTYPE_float "};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(reshape,
+                     kOpenCL,
+                     kFloat,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(reshape2,
+                     kOpenCL,
+                     kFloat,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
diff --git a/lite/kernels/opencl/reshape_compute_test.cc b/lite/kernels/opencl/reshape_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5ba1c118e7fa952fe1172080ee97555a82c7260
--- /dev/null
+++ b/lite/kernels/opencl/reshape_compute_test.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <random>
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/operators/reshape_op.h"
+#include "lite/utils/logging.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+static DDim ValidateShape(const std::vector<int>& shape,
+                          const DDim& input_dims) {
+  const lite::DDim::value_type input_size = input_dims.production();
+  auto input_shape = input_dims.Vectorize();
+  bool all_positive = std::all_of(
+      input_shape.cbegin(), input_shape.cend(), [](lite::DDim::value_type i) {
+        return i > 0;
+      });
+  // only one dimension can be set to -1, whose size will be automatically
+  // inferred.
+  const int unk_dim_val = -1;
+  const int copy_dim_val = 0;
+
+  std::vector<lite::DDim::value_type> output_shape(shape.size(), 0);
+  lite::DDim::value_type capacity = 1;
+  int unk_dim_idx = -1;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (shape[i] == unk_dim_val) {
+      CHECK_EQ(unk_dim_idx, -1)
+          << "Only one input dimension of Attr(shape) can be unknown.";
+      unk_dim_idx = i;
+    } else if (shape[i] == copy_dim_val) {
+      CHECK_LT(static_cast<int>(i), input_shape.size())
+          << "The index of dimension to copy from input shape must be less "
+             "than the size of input shape.";
+    } else {
+      CHECK_GT(shape[i], 0) << "Each input dimension of Attr(shape) must not "
+                               "be negative except one unknown dimension.";
+    }
+
+    capacity *= (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
+                          : input_shape[i]);
+    output_shape[i] = (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
+                                : input_shape[i]);
+  }
+
+  if (unk_dim_idx != -1) {
+    if (all_positive) {
+      // input_size < 0 and is un-determinate in compile time, skip the check,
+      // for example, input_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
+      // capacity = -24, input_size = -8, output_shape[0] = 0
+      // the following check will fail.
+      output_shape[unk_dim_idx] = -input_size / capacity;
+      CHECK_EQ(output_shape[unk_dim_idx] * capacity, -input_size)
+          << "Invalid shape is given.";
+    } else {
+      output_shape[unk_dim_idx] = -1;
+    }
+  } else {
+    CHECK_EQ(capacity, input_size) << "Invalid shape is given.";
+  }
+  return lite::DDim(output_shape);
+}
+
+TEST(reshape_opencl, compute) {
+  LOG(INFO) << "to get kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "reshape", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  auto kernel = std::move(kernels.front());
+
+  LOG(INFO) << "created reshape kernel";
+
+  LOG(INFO) << "prepare kernel ------";
+
+  int64_t batch_size = 1;
+  int64_t ic = 2;
+  int64_t ih = 4;
+  int64_t iw = 6;
+
+  lite::Tensor input, output;
+
+  operators::ReshapeParam param;
+
+  Tensor shape_tensor;
+  shape_tensor.Resize({2});
+  auto* shape_tensor_data = shape_tensor.mutable_data<int>();
+  shape_tensor_data[0] = 6;
+  shape_tensor_data[1] = 8;
+
+  param.x = &input;
+  param.shape_tensor = &shape_tensor;  // use shape_tensor
+  param.inplace = false;
+  param.output = &output;
+
+  const DDim input_dim =
+      lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
+  input.Resize(input_dim);
+
+  std::vector<int> final_shape = std::vector<int>(
+      shape_tensor_data, shape_tensor_data + shape_tensor.numel());
+
+  auto output_dim = ValidateShape(final_shape, input_dim);
+  param.output->Resize(output_dim);
+  LOG(INFO) << " output_dim------" << output_dim;
+
+  LOG(INFO) << "prepare kernel SetParam------";
+  kernel->SetParam(param);
+
+  size_t input_image_width = iw * ((ic + 3) / 4);
+  size_t input_image_height = ih * batch_size;
+
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+
+  // LOG(INFO) << "map input ...";
+  // auto* mapped_input =
+  //     static_cast<float*>(TargetWrapperCL::MapImage(input_data,
+  //                                                   input_image_width,
+  //                                                   input_image_height,
+  //                                                   cl_image2d_row_pitch,
+  //                                                   cl_image2d_slice_pitch));
+
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> gen(-5, 5);
+  std::vector<float> input_v(batch_size * ic * ih * iw);
+
+  LOG(INFO) << "gen input ...";
+
+  float* input_v_data = &input_v[0];
+  for (auto& i : input_v) {
+    i = gen(engine);
+  }
+  paddle::lite::CLImageConverterDefault default_convertor;
+
+  std::vector<float> x_image_data(input_image_width * input_image_height *
+                                  4);  // 4 : RGBA
+
+  LOG(INFO) << "set mapped input ...";
+  default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
+
+  auto* input_image = input.mutable_data<float, cl::Image2D>(
+      input_image_width, input_image_height, x_image_data.data());
+
+  LOG(INFO) << "prepare kernel ready";
+
+  LOG(INFO) << "mutable output ...";
+  CLImageConverterDefault default_converter;
+  DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = output.mutable_data<float, cl::Image2D>(
+      out_image_shape[0], out_image_shape[1]);
+  VLOG(4) << "out_dims= " << output_dim;
+
+  LOG(INFO) << "kernel context ...";
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+
+  std::unique_ptr<KernelContext> reshape_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(reshape_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(reshape_context));
+
+  LOG(INFO) << "kernel launch ...";
+  kernel->Launch();
+
+  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  auto* out_ptr = param.output->data<float, cl::Image2D>();
+  auto it = wait_list->find(out_image);
+
+  if (it != wait_list->end()) {
+    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+    auto& event = *(it->second);
+    event.wait();
+  } else {
+    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+  }
+
+  float* out_image_data = new float[out_image_shape.production() * 4];
+  TargetWrapperCL::ImgcpySync(out_image_data,
+                              output.data<float, cl::Image2D>(),
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  float* out_data = new float[out_image_shape.production() * 4];
+  default_converter.ImageToNCHW(
+      out_image_data, out_data, out_image_shape, output_dim);
+  // check output dims
+  for (int i = 0; i < output.dims().size(); i++) {
+    CHECK_EQ(output.dims()[i], shape_tensor_data[i]);
+  }
+
+  // check output data
+  for (int i = 0; i < output.numel(); i++) {
+    EXPECT_NEAR(out_data[i], input_v_data[i], 1e-3);
+    if (abs(out_data[i] - input_v_data[i]) > 1e-3) {
+      LOG(INFO) << "error idx:" << i;
+    }
+  }
+}
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(reshape, kOpenCL, kFloat, kImageDefault, image2d);
+USE_LITE_KERNEL(reshape2, kOpenCL, kFloat, kImageDefault, image2d);