[cherry-pick][OPENCL] remove conv redundant's for opencl kernel. test… (#3938)

* [cherry-pick][OPENCL] remove conv redundant's for opencl kernel. test=develop Co-authored-by: N xiebaiyuan <xiebaiyuan@qq.com>

[cherry-pick][OPENCL] remove conv redundant's for opencl kernel. test… (#3938)
* [cherry-pick][OPENCL] remove conv redundant's for opencl kernel. test=develop Co-authored-by: N xiebaiyuan <xiebaiyuan@qq.com>
273b1fa2 · ysh329 · GitHub · 594175af · 273b1fa2 · 273b1fa2
138 changed file
--- a/docs/develop_guides/add_operation.md
+++ b/docs/develop_guides/add_operation.md
@@ -27,6 +27,28 @@
        bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
        void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
        std::string DebugString() const override { return "argmax"; }
+
+    #ifdef LITE_WITH_PROFILE
+        void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
+            auto input_dims = param_.X->dims();
+            auto output_dims = param_.Out->dims();
+            ch->input_shape = ch->DimToStr(input_dims);
+            ch->output_shape = ch->DimToStr(output_dims);
+            ch->remark = "axis" + std::to_string(param_.Axis);
+
+            auto axis = param_.Axis;
+            if (axis < 0) {
+                axis += input_dims.size();
+            }
+            int max_num = 1;
+            for (int64_t i = axis + 1; i < input_dims.size(); i++)
+                max_num *= input_dims[i];
+            float gops = 1.0f;
+            for (int i = 1; i <= max_num; i++) gops *= i;
+            ch->macs = gops * output_dims.production();
+        }
+    #endif
+
    private:
        mutable ArgmaxParam param_;
    };
@@ -85,6 +107,13 @@
        using param_t = operators::ArgmaxParam;
        void Run() override;
        virtual ~ArgmaxCompute() = default;
+    #ifdef LITE_WITH_PROFILE
+        virtual void SetProfileRuntimeKernelInfo(
+            paddle::lite::profile::OpCharacter* ch) {
+            ch->kernel_func_name = kernel_func_name_;
+        }
+        std::string kernel_func_name_{"NotImplForArgmax"};
+    #endif
    };
    ```
 - 在paddlelite/lite/kernels/arm/目录下新建argmax_compute.cc文件，主要实现Run函数。`Run()`函数调用paddlelite/lite/bachends/arm/math/argmax.h中的`argmax_func()`函数，根据输入计算输出。最后在argmax_compute.cc文件中，我们绑定argmax的输入输出（为tensor的输入参数都需要绑定），代码如下：
@@ -95,6 +124,9 @@
        lite::Tensor* output = param.Out;
        int axis = param.Axis;
        lite::arm::math::argmax_func(input, axis, output);
+    #ifdef LITE_WITH_PROFILE
+        kernel_func_name_ = "argmax_func";
+    #endif
        return;
    }


--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <gflags/gflags.h>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -25,6 +24,7 @@
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE
+#include <gflags/gflags.h>

 using paddle::lite::profile::Timer;

@@ -34,6 +34,10 @@ DEFINE_string(input_shape,
 DEFINE_bool(use_optimize_nb,
            false,
            "optimized & naive buffer model for mobile devices");
+DEFINE_string(backend,
+              "arm_cpu",
+              "choose backend for valid_places: arm_cpu | opencl. Compile "
+              "OpenCL version if you choose opencl");
 DEFINE_string(arg_name, "", "the arg name");

 namespace paddle {
@@ -49,9 +53,19 @@ void OutputOptModel(const std::string& load_model_dir,
                           Place{TARGET(kX86), PRECISION(kInt64)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
 #else
-  config.set_valid_places({
-      Place{TARGET(kARM), PRECISION(kFloat)},
-  });
+  if (FLAGS_backend == "opencl") {
+    config.set_valid_places({
+        Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
+        Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+        TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
+    });
+  } else {  // arm_cpu
+    config.set_valid_places({
+        Place{TARGET(kARM), PRECISION(kFloat)},
+    });
+  }
 #endif
  auto predictor = lite_api::CreatePaddlePredictor(config);

@@ -117,16 +131,40 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
            << ", min time: " << ti.LapTimes().Min() << " ms"
            << ", max time: " << ti.LapTimes().Max() << " ms.";

-  auto output = predictor->GetOutput(0);
-  auto out = output->data<float>();
-  LOG(INFO) << "out " << out[0];
-  LOG(INFO) << "out " << out[1];
-  auto output_shape = output->shape();
-  int output_num = 1;
-  for (int i = 0; i < output_shape.size(); ++i) {
-    output_num *= output_shape[i];
+  // output summary
+  size_t output_tensor_num = predictor->GetOutputNames().size();
+  LOG(INFO) << "output tensor num:" << output_tensor_num;
+
+  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
+    auto output_tensor = predictor->GetOutput(tidx);
+    LOG(INFO) << "============= output tensor " << tidx << " =============";
+    auto tensor_shape = output_tensor->shape();
+    std::string tensor_shape_str{""};
+    int output_tensor_numel = 1;
+    for (int i = 0; i < tensor_shape.size(); ++i) {
+      output_tensor_numel *= tensor_shape[i];
+      tensor_shape_str += std::to_string(tensor_shape[i]);
+      tensor_shape_str += (i < tensor_shape.size() - 1) ? "x" : "";
+    }
+    auto out_data = output_tensor->data<float>();
+    auto out_mean =
+        paddle::lite::compute_mean<float>(out_data, output_tensor_numel);
+    auto out_std_dev = paddle::lite::compute_standard_deviation<float>(
+        out_data, output_tensor_numel, true, out_mean);
+
+    VLOG(0) << "output tensor " << tidx << " dims:" << tensor_shape_str;
+    VLOG(0) << "output tensor " << tidx
+            << " elements num:" << output_tensor_numel;
+    VLOG(0) << "output tensor " << tidx
+            << " standard deviation:" << out_std_dev;
+    VLOG(0) << "output tensor " << tidx << " mean value:" << out_mean << "\n";
+
+    // print result
+    for (int i = 0; i < output_tensor_numel; ++i) {
+      VLOG(2) << "output_tensor->data<float>()[" << i
+              << "]:" << output_tensor->data<float>()[i];
+    }
  }
-  LOG(INFO) << "output_num: " << output_num;

  // please turn off memory_optimize_pass to use this feature.
  if (FLAGS_arg_name != "") {
@@ -162,6 +200,7 @@ int main(int argc, char** argv) {
              << "--model_dir /path/to/your/model";
    exit(0);
  }
+
  std::string save_optimized_model_dir = "";
  if (FLAGS_use_optimize_nb) {
    save_optimized_model_dir = FLAGS_model_dir;

--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -26,6 +26,7 @@ USE_MIR_PASS(argument_type_display_pass);
 USE_MIR_PASS(runtime_context_assign_pass);
 USE_MIR_PASS(graph_visualize_pass);

+USE_MIR_PASS(remove_tf_redundant_ops_pass);
 USE_MIR_PASS(lite_conv_bn_fuse_pass);
 USE_MIR_PASS(lite_fc_fuse_pass);
 USE_MIR_PASS(lite_shuffle_channel_fuse_pass);

--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
@@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
  }
 }

+template <>
+void elementwise_div<int64_t>(const int64_t* dinx,
+                              const int64_t* diny,
+                              int64_t* dout,
+                              int num) {
+  for (int i = 0; i < num; i++) {
+    *dout = *dinx / *diny;
+    dout++;
+    dinx++;
+    diny++;
+  }
+}
+
 template <>
 void elementwise_div<float>(const float* dinx,
                            const float* diny,
@@ -1306,6 +1319,28 @@ void elementwise_div<float>(const float* dinx,
  }
 }

+template <>
+void elementwise_div_broadcast<int64_t>(const int64_t* dinx,
+                                        const int64_t* diny,
+                                        int64_t* dout,
+                                        int batch,
+                                        int channels,
+                                        int num) {
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channels; ++j) {
+      int offset = (i * channels + j) * num;
+      const int64_t* din_ptr = dinx + offset;
+      const int64_t diny_data = diny[j];
+      int64_t* dout_ptr = dout + offset;
+      for (int p = 0; p < num; p++) {
+        *dout_ptr = *din_ptr / diny_data;
+        dout_ptr++;
+        din_ptr++;
+      }
+    }
+  }
+}
+
 template <>
 void elementwise_div_broadcast<float>(const float* dinx,
                                      const float* diny,

--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -119,19 +119,13 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
  }
 }

-cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
+cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size,
                                         size_t max_work_size,
                                         int divisor) {
  int preferred_lws = 0;
-#if 1
  auto gws0 = global_work_size[0];
  auto gws1 = global_work_size[1];
  auto gws2 = global_work_size[2];
-#else
-  auto gws2 = global_work_size[0];
-  auto gws1 = global_work_size[1];
-  auto gws0 = global_work_size[2];
-#endif
  if (divisor > 1) {
    max_work_size /= divisor;
  }
@@ -147,15 +141,40 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
  }
-#if 1
  return cl::NDRange{static_cast<size_t>(gws0),
                     static_cast<size_t>(gws1),
                     static_cast<size_t>(gws2)};
-#else
+}
+
+cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
+                                                size_t max_work_size,
+                                                int divisor) {
+  int preferred_lws = 0;
+  auto gws2 = global_work_size[0];
+  auto gws1 = global_work_size[1];
+  auto gws0 = global_work_size[2];
+  if (divisor > 1) {
+    max_work_size /= divisor;
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (gws1 > max_work_size && max_work_size > 0) {
+    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
+  }
+  while (gws2 * gws1 > max_work_size && max_work_size > 0) {
+    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
+  }
+  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
+    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
+  }
  return cl::NDRange{static_cast<size_t>(gws2),
                     static_cast<size_t>(gws1),
                     static_cast<size_t>(gws0)};
-#endif
+}
+
+bool CLContext::IsArmMali() {
+  return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI;
 }

 cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,

--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -63,11 +63,14 @@ class CLContext {

  cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);

-  cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
+  cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size,
                                size_t max_work_size,
                                int divitor = 2);
-  //  cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
-  //                                   size_t max_work_size);
+
+  cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
+                                       size_t max_work_size,
+                                       int divitor = 2);
+  bool IsArmMali();

 private:
  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;

--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt(
    __private const int global_size_dim2,
    __read_only image2d_t input_image,
    __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
    __read_only image2d_t new_scale,
    __read_only image2d_t new_biase,
@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple(
    __private const int global_size_dim2,
    __read_only image2d_t input_image,
    __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
    __read_only image2d_t new_scale,
    __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __read_only image2d_t input_image,
                         __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                         __read_only image2d_t bias,
-#endif
                         __write_only image2d_t output_image,
                         __private const int stride,
                         __private const int offset,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
-#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
-#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __read_only image2d_t input_image,
                         __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                         __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
                         __read_only image2d_t new_scale,
                         __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl
@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
-#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
-#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,
@@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
                   (int2)(out_w_base_id + out_w_id4, item_h_id),
                   output[4]);
  }
-}
\ No newline at end of file
+}
--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __read_only image2d_t input_image,
                         __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                         __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
                         __read_only image2d_t new_scale,
                         __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl
@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
-#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
-#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,
@@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
                   (int2)(out_w_base_id + out_w_id4, item_h_id),
                   output[4]);
  }
-}
\ No newline at end of file
+}
--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl
@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
                           __private const int global_size_dim2,
                           __read_only image2d_t input,
                           __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                           __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
                           __read_only image2d_t new_scale,
                           __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3(
    __private const int global_size_dim2,
    __read_only image2d_t input,
    __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
-#endif
    __write_only image2d_t output_image,
    __private const int stride,
    __private const int offset,
@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
                                 __private const int ou_nh,
                                 __read_only image2d_t input,
                                 __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                 __read_only image2d_t bias,
-#endif
                                 __write_only image2d_t output_image,
                                 __private const int stride,
                                 __private const int pad,

--- a/lite/backends/opencl/cl_kernel/image/expand_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/expand_kernel.cl
+#include <cl_common.h>
+
+__kernel void expend_c1(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  const int in_c = 0;
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  in.y = 0;
+  in.z = 0;
+  in.w = 0;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+
+__kernel void expend_c2(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  const int in_c = 0;
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  in.z = 0;
+  in.w = 0;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+__kernel void expend_c3(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  const int in_c = 0;
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  in.w = 0;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+__kernel void expend_c4(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  const int in_c = 0;
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+
+__kernel void expend_cn(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  const int in_c = out_c;
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
\ No newline at end of file
--- a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
@@ -63,7 +63,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
  if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
      input3 = (CL_DTYPE4)(0.0);
  }
-  CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  CL_DTYPE4 out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+                      input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) + 
+		      input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+		      input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val);
 
  // y
@@ -97,7 +100,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
      input3 = (CL_DTYPE4)(0.0);
  }

-  out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+            input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
+            input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+            input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val);

  // z
@@ -130,7 +136,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
  if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
      input3 = (CL_DTYPE4)(0.0);
  }
-  out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+            input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
+            input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+            input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val);

  // w
@@ -163,6 +172,9 @@ __kernel void grid_sampler(__read_only image2d_t input,
  if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
      input3 = (CL_DTYPE4)(0.0);
  }
-  out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+            input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) + 
+            input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+            input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val);
 }
--- a/lite/backends/opencl/cl_kernel/image/pixel_shuffle_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/pixel_shuffle_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cl_common.h>
+__kernel void pixel_shuffle(__read_only image2d_t input_image,
+                            __write_only image2d_t output_image,
+                            __private const int in_N,
+                            __private const int in_C,
+                            __private const int in_H,
+                            __private const int in_W,
+                            __private const int out_N,
+                            __private const int out_C,
+                            __private const int out_H,
+                            __private const int out_W,
+                            __private const int upscale_factor) {
+  const int out_c4 = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int out_h = out_nh % out_H;
+  int out_n = out_nh / out_H;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int in_h = out_h / upscale_factor;
+  int in_w = out_w / upscale_factor;
+  int in_nh = out_n * in_H + in_h;
+
+  CL_DTYPE4 res;
+  int out_c;
+  int in_c;
+  CL_DTYPE4 in;
+  int2 in_pos;
+
+  out_c = out_c4 * 4 + 0;
+  in_c = out_c * upscale_factor * upscale_factor +
+         (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+  in_pos.x = (in_c / 4) * in_W + in_w;
+  in_pos.y = in_nh;
+  in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
+  if (in_c % 4 == 0) {
+    res.x = in.x;
+  } else if (in_c % 4 == 1) {
+    res.x = in.y;
+  } else if (in_c % 4 == 2) {
+    res.x = in.z;
+  } else if (in_c % 4 == 3) {
+    res.x = in.w;
+  }
+
+  out_c = out_c4 * 4 + 1;
+  in_c = out_c * upscale_factor * upscale_factor +
+         (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+  in_pos.x = (in_c / 4) * in_W + in_w;
+  in_pos.y = in_nh;
+  in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
+  if (in_c % 4 == 0) {
+    res.y = in.x;
+  } else if (in_c % 4 == 1) {
+    res.y = in.y;
+  } else if (in_c % 4 == 2) {
+    res.y = in.z;
+  } else if (in_c % 4 == 3) {
+    res.y = in.w;
+  }
+
+  out_c = out_c4 * 4 + 2;
+  in_c = out_c * upscale_factor * upscale_factor +
+         (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+  in_pos.x = (in_c / 4) * in_W + in_w;
+  in_pos.y = in_nh;
+  in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
+  if (in_c % 4 == 0) {
+    res.z = in.x;
+  } else if (in_c % 4 == 1) {
+    res.z = in.y;
+  } else if (in_c % 4 == 2) {
+    res.z = in.z;
+  } else if (in_c % 4 == 3) {
+    res.z = in.w;
+  }
+
+  out_c = out_c4 * 4 + 3;
+  in_c = out_c * upscale_factor * upscale_factor +
+         (out_h % upscale_factor) * upscale_factor + (out_w % upscale_factor);
+  in_pos.x = (in_c / 4) * in_W + in_w;
+  in_pos.y = in_nh;
+  in = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, in_pos);
+  if (in_c % 4 == 0) {
+    res.w = in.x;
+  } else if (in_c % 4 == 1) {
+    res.w = in.y;
+  } else if (in_c % 4 == 2) {
+    res.w = in.z;
+  } else if (in_c % 4 == 3) {
+    res.w = in.w;
+  }
+
+  int2 out_pos;
+  out_pos.x = out_c4 * out_W + out_w;
+  out_pos.y = out_nh;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, out_pos, res);
+}
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() {
    }
    return t_str;
  };
+  const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
+  LOG(INFO) << "device_version:" << device_version;
+
  LOG(INFO) << "device_type:" << device_type_to_str(device_type);
  device_info_["CL_DEVICE_TYPE"] = device_type;

@@ -317,6 +320,8 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
  return device_info_;
 }

+GpuType& CLRuntime::GetGpuType() { return gpu_type_; }
+
 void CLRuntime::GetAdrenoContextProperties(
    std::vector<cl_context_properties>* properties,
    GPUPerfMode gpu_perf_mode,
@@ -365,5 +370,26 @@ void CLRuntime::GetAdrenoContextProperties(
  properties->push_back(0);
 }

+double CLRuntime::GetCommandTime(const cl::Event& event) {
+  command_queue().finish();
+  auto start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+  auto stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+  return (stop_nanos - start_nanos) / 1000000.0;
+}
+
+double CLRuntime::GetQueuedTime(const cl::Event& event) {
+  command_queue().finish();
+  return (event.getProfilingInfo<CL_PROFILING_COMMAND_START>() -
+          event.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>()) /
+         1000000.0;
+}
+
+double CLRuntime::GetSubmitTime(const cl::Event& event) {
+  command_queue().finish();
+  return (event.getProfilingInfo<CL_PROFILING_COMMAND_START>() -
+          event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()) /
+         1000000.0;
+}
+
 }  // namespace lite
 }  // namespace paddle
--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
@@ -93,6 +93,14 @@ class CLRuntime {

  std::map<std::string, size_t>& GetDeviceInfo();

+  GpuType& GetGpuType();
+
+  double GetCommandTime(const cl::Event& event);
+
+  double GetQueuedTime(const cl::Event& event);
+
+  double GetSubmitTime(const cl::Event& event);
+
 private:
  CLRuntime() { Init(); }


--- a/lite/backends/opencl/cl_utility.h
+++ b/lite/backends/opencl/cl_utility.h
@@ -45,5 +45,18 @@ const char* opencl_error_to_str(cl_int error);
 #else
 #define CL_CHECK_FATAL(err_code__)
 #endif
+
+#ifdef LITE_WITH_PROFILE
+#define EnqueueNDRangeKernel(                                      \
+    context, kernel, gws_offset, gws, lws, event_wait_list, event) \
+  context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(    \
+      kernel, gws_offset, gws, lws, event_wait_list, &event)
+#else
+#define EnqueueNDRangeKernel(                                      \
+    context, kernel, gws_offset, gws, lws, event_wait_list, event) \
+  context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(    \
+      kernel, gws_offset, gws, lws, event_wait_list, nullptr)
+#endif
+
 }  // namespace lite
 }  // namespace paddle
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
@@ -62,6 +62,18 @@ class KernelBase {
    profiler_ = profiler;
    profile_id_ = id;
  }
+
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = std::string("NotImpl");
+#ifdef LITE_WITH_ARM
+    ch->cl_event = event_;
+#endif
+  }
+
+  virtual void SetIsKernelTest(bool is_kernel_test) {
+    is_kernel_test_ = is_kernel_test;
+  }
 #endif

  void Launch() {
@@ -86,14 +98,24 @@ class KernelBase {
 #if defined(LITE_WITH_MLU)
    WorkSpace::Global_MLU().AllocReset();
 #endif
+
 #ifdef LITE_WITH_PROFILE
-    profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
-    profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
+    if (!is_kernel_test_) {
+      profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
+      profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
+    }
+
    Run();
-#ifdef LITE_WITH_OPENCL
-    CLRuntime::Global()->command_queue().finish();
-#endif
-    profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
+
+    if (is_first_epoch_for_profiler_ && (!is_kernel_test_)) {
+      SetProfileRuntimeKernelInfo(profiler_->GetOpCharacter(profile_id_));
+      is_first_epoch_for_profiler_ = false;
+    }
+
+    if (!is_kernel_test_) {
+      profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
+    }
+
 #else
    Run();
 #endif
@@ -185,6 +207,11 @@ class KernelBase {
 #ifdef LITE_WITH_PROFILE
  profile::Profiler* profiler_{nullptr};
  int profile_id_{-1};
+  bool is_first_epoch_for_profiler_{true};
+  bool is_kernel_test_{true};
+#ifdef LITE_WITH_OPENCL
+  cl::Event event_;
+#endif
 #endif
 };


--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -28,6 +28,7 @@ lite_cc_library(mir_passes
      elimination/identity_scale_eliminate_pass.cc
      elimination/identity_dropout_eliminate_pass.cc
      elimination/elementwise_mul_constant_eliminate_pass.cc
+      elimination/remove_tf_redundant_ops_pass.cc
      static_kernel_pick_pass.cc
      variable_place_inference_pass.cc
      type_target_cast_pass.cc

--- a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
+++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/elimination/remove_tf_redundant_ops_pass.h"
+#include <unordered_set>
+#include "lite/core/mir/graph_visualize_pass.h"
+#include "lite/core/mir/pass.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher.h"
+#include "lite/model_parser/cpp/var_desc.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void RemoveTFRedundantOpsPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  RemoveSqueeze2Reshape2Pattern(graph);
+  RemoveReshape2Pattern(graph);
+}
+
+void RemoveTFRedundantOpsPass::RemoveReshape2Pattern(
+    const std::unique_ptr<SSAGraph>& graph) {
+  bool found = false;
+  Node* softmax_node{nullptr};
+  Node* reshape2_node{nullptr};
+  std::string reshape2_out_arg_name;
+  Node* fetch_node{nullptr};
+  std::string fetch_in_arg_name;
+  DDim softmax_out_dims;
+  DDim reshape2_out_dims;
+
+  for (auto& op_node : graph->StmtTopologicalOrder()) {
+    if (op_node->AsStmt().picked_kernel().op_type() == "softmax") {
+      softmax_node = op_node;
+    } else if (op_node->AsStmt().picked_kernel().op_type() == "reshape2") {
+      reshape2_node = op_node;
+    } else if (op_node->AsStmt().picked_kernel().op_type() == "fetch") {
+      fetch_node = op_node;
+      fetch_in_arg_name = fetch_node->inlinks.front()->AsArg().name;
+    }
+  }
+
+  if (softmax_node == nullptr || reshape2_node == nullptr) {
+    return;
+  }
+
+  // Get out tensor dims of softmax, reshape2
+  auto* scope = softmax_node->AsStmt().op()->scope();
+  auto softmax_out_arg_name = softmax_node->outlinks.front()->AsArg().name;
+  auto softmax_out_tensor =
+      scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
+  softmax_out_dims = softmax_out_tensor.dims();
+
+  for (auto out_node : reshape2_node->outlinks) {
+    if (out_node->IsArg() && out_node->outlinks.size() != 0) {
+      reshape2_out_arg_name = reshape2_node->outlinks.front()->AsArg().name;
+      auto reshape2_out_tensor =
+          scope->FindVar(reshape2_out_arg_name)->Get<lite::Tensor>();
+      reshape2_out_dims = reshape2_out_tensor.dims();
+    }
+  }
+
+  VLOG(3) << "reshape2_out_dims:" << reshape2_out_dims;
+  VLOG(3) << "softmax_out_dims:" << softmax_out_dims;
+  VLOG(3) << "found:" << found;
+
+  if (softmax_out_dims == reshape2_out_dims &&
+      softmax_node->outlinks.front() == reshape2_node->inlinks.front() &&
+      reshape2_out_arg_name == fetch_in_arg_name) {
+    found = true;
+  }
+
+  if (found) {
+    // link out_arg to op
+    IR_NODE_LINK_TO(softmax_node->outlinks.front(), fetch_node);
+
+    // collect nodes to safe remove
+    std::unordered_set<const Node*> nodes_to_remove;
+    auto remove_inst_node_and_out_args_node = [&](Node* n) {
+      nodes_to_remove.insert(n);
+      for (auto& out : n->outlinks) {
+        nodes_to_remove.insert(out);
+      }
+    };
+
+    remove_inst_node_and_out_args_node(reshape2_node);
+    GraphSafeRemoveNodes(graph.get(), nodes_to_remove);
+    auto fetch_op_desc = fetch_node->AsStmt().mutable_op_info();
+    fetch_op_desc->SetInput("X",
+                            {softmax_node->outlinks.front()->AsArg().name});
+  }
+  VLOG(5) << "\n" << Visualize(graph.get());
+}
+
+void RemoveTFRedundantOpsPass::RemoveSqueeze2Reshape2Pattern(
+    const std::unique_ptr<SSAGraph>& graph) {
+  VLOG(5) << Visualize(graph.get());
+  bool found = false;
+
+  // find out_arg->squeeze2
+  // find out_arg_dims of out_arg
+  Node* out_arg_node{nullptr};
+  DDim out_arg_dims;
+  Node* squeeze2_node{nullptr};
+
+  // find squeeze2->reshape2
+  // find output dims of squeeze2 and reshape2 nodes
+  DDim squeeze2_out_dims;
+  Node* reshape2_node{nullptr};
+  Node* reshape2_out_node{nullptr};
+  DDim reshape2_out_dims;
+
+  // find next inst node of reshape2
+  Node* next_inst_node_of_reshape2_out{nullptr};
+
+  for (auto& node : graph->StmtTopologicalOrder()) {
+    if (node->AsStmt().picked_kernel().op_type() != "squeeze2") continue;
+    auto* scope = node->AsStmt().op()->scope();
+
+    // find inlinks of squeeze2: out_arg_node
+    squeeze2_node = node;
+    auto squeeze2_inlinks = squeeze2_node->inlinks;
+    VLOG(5) << "squeeze2_inlinks.size():" << squeeze2_inlinks.size();
+    for (auto& in_link : squeeze2_inlinks) {
+      if (in_link->IsArg() && squeeze2_inlinks.size() == 1) {
+        out_arg_node = in_link;
+        auto* var = scope->FindVar(out_arg_node->AsArg().name);
+        out_arg_dims = var->Get<lite::Tensor>().dims();
+        VLOG(5) << "arg name:" << out_arg_node->AsArg().name
+                << " dims:" << out_arg_dims;
+      } else {
+        // found mutli-input links
+        continue;
+      }
+    }
+
+    // find squeeze2->reshape2 pattern
+    // and output dims of squeeze2, reshape2 nodes
+    auto squeeze2_outlinks = squeeze2_node->outlinks;
+    for (auto& squeeze2_out_link : squeeze2_outlinks) {
+      if (squeeze2_out_link->IsArg() &&
+          squeeze2_out_link->outlinks.size() != 0) {
+        auto* squeeze2_out_var =
+            scope->FindVar(squeeze2_out_link->AsArg().name);
+        squeeze2_out_dims = squeeze2_out_var->Get<lite::Tensor>().dims();
+
+        VLOG(5) << "squeeze2_out_arg.name:" << squeeze2_out_link->AsArg().name
+                << " squeeze2_out_dims:" << squeeze2_out_dims
+                << " squeeze2_out_link->outlinks.size():"
+                << squeeze2_out_link->outlinks.size();
+
+        for (auto& out2_link : squeeze2_out_link->outlinks) {
+          if (out2_link->IsStmt() &&
+              out2_link->AsStmt().picked_kernel().op_type() == "reshape2") {
+            reshape2_node = out2_link;
+            for (auto& reshape2_out_link : reshape2_node->outlinks) {
+              if (reshape2_out_link->IsArg() &&
+                  reshape2_out_link->outlinks.size() != 0) {
+                reshape2_out_node = reshape2_out_link;
+                auto* reshape2_out_var =
+                    scope->FindVar(reshape2_out_link->AsArg().name);
+                reshape2_out_dims =
+                    reshape2_out_var->Get<lite::Tensor>().dims();
+
+                VLOG(5) << "reshape2_out_node:" << reshape2_out_node
+                        << " reshape2_out_name:"
+                        << reshape2_out_link->AsArg().name
+                        << " reshape2_out_dims:" << reshape2_out_dims;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // find next inst node of reshape2
+    VLOG(5) << "reshape2_out_node->outlinks.size():"
+            << reshape2_out_node->outlinks.size()
+            << " reshape2_out_node->IsStmt():" << reshape2_out_node->IsStmt();
+    VLOG(5) << "reshape2_out_node->AsArg().name:"
+            << reshape2_out_node->AsArg().name;
+    if (reshape2_out_node != nullptr &&
+        reshape2_out_node->outlinks.size() == 1 &&
+        reshape2_out_node->outlinks.front()->IsStmt()) {
+      next_inst_node_of_reshape2_out = reshape2_out_node->outlinks.front();
+      found = true;
+      break;
+      VLOG(5)
+          << "next_inst_node_of_reshape2_out->picked_kernel().op_type():"
+          << next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type();
+    }
+
+    VLOG(5) << "==============================";
+    VLOG(5) << "out_arg_dims:" << out_arg_dims;
+    VLOG(5) << "squeeze2_out_dims:" << squeeze2_out_dims;
+    VLOG(5) << "reshape2_out_dims:" << reshape2_out_dims;
+    VLOG(5) << "==============================";
+  }
+
+  // replace pattern
+  if (found && out_arg_dims[1] == squeeze2_out_dims[1] &&
+      out_arg_dims[1] == reshape2_out_dims[1] && out_arg_dims[1] == 1001 &&
+      out_arg_dims[2] == out_arg_dims[3] && out_arg_dims[2] == 1 &&
+      next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type() ==
+          "softmax") {
+    // link out_arg to op
+    IR_NODE_LINK_TO(out_arg_node, next_inst_node_of_reshape2_out);
+
+    // collect nodes to safe remove
+    std::unordered_set<const Node*> nodes_to_remove;
+    auto remove_inst_node_and_out_args_node = [&](Node* n) {
+      nodes_to_remove.insert(n);
+      for (auto& out : n->outlinks) {
+        nodes_to_remove.insert(out);
+      }
+    };
+    remove_inst_node_and_out_args_node(squeeze2_node);
+    remove_inst_node_and_out_args_node(reshape2_node);
+    GraphSafeRemoveNodes(graph.get(), nodes_to_remove);
+    auto next_inst_op_desc =
+        next_inst_node_of_reshape2_out->AsStmt().mutable_op_info();
+    next_inst_op_desc->SetInput("X", {out_arg_node->AsArg().name});
+    VLOG(5) << Visualize(graph.get());
+  }
+  VLOG(5) << "replace pattern fininshed";
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(remove_tf_redundant_ops_pass,
+                  paddle::lite::mir::RemoveTFRedundantOpsPass)
+    .BindTargets({TARGET(kOpenCL), TARGET(kARM)});
--- a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h
+++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "lite/core/mir/pass.h"
+#include "lite/core/tensor.h"
+#include "lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+/*
+ * mir::RemoveTFRedundantOpsPass remove reshape2->squeeze2 pattern
+ * and last reshape2 op for tensorflow mobilenetv1/v2.
+ */
+class RemoveTFRedundantOpsPass : public mir::StmtPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+  void RemoveReshape2Pattern(const std::unique_ptr<SSAGraph>& graph);
+  void RemoveSqueeze2Reshape2Pattern(const std::unique_ptr<SSAGraph>& graph);
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
--- a/lite/core/mir/pattern_matcher.h
+++ b/lite/core/mir/pattern_matcher.h
@@ -20,6 +20,7 @@

 #include <memory>
 #include <numeric>
+#include <set>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>

--- a/lite/core/mir/variable_place_inference_pass.h
+++ b/lite/core/mir/variable_place_inference_pass.h
@@ -170,6 +170,9 @@ class VariablePlaceInferencePass : public DebugPass {
          // If is quantization, infer the Int8 type.
          if (type->precision() == PRECISION(kInt8)) {
            x_out->AsArg().type = type;
+          } else if (type->precision() == PRECISION(kFP16) &&
+                     type->target() != TARGET(kOpenCL)) {
+            x_out->AsArg().type = type;
          } else {
            PrecisionType tmp_ptype = x_out->AsArg().type->precision();
            x_out->AsArg().type = LiteType::GetTensorTy(

--- a/lite/core/op_lite.h
+++ b/lite/core/op_lite.h
@@ -73,6 +73,9 @@ class OpLite : public Registry {
  // Indicate whether the Op runs only once or not
  virtual bool run_once() const { return false; }
  std::string Type() { return op_type_; }
+#ifdef LITE_WITH_PROFILE
+  virtual void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {}
+#endif

  // Link the external execution environ to internal context.
  bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);

--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -92,7 +92,8 @@ class Optimizer {
           "bm_subgraph_pass",
           "apu_subgraph_pass",
           "rknpu_subgraph_pass",
-           "static_kernel_pick_pass",        // pick original kernel from graph
+           "static_kernel_pick_pass",  // pick original kernel from graph
+           "remove_tf_redundant_ops_pass",
           "variable_place_inference_pass",  // inference arg/var's
           // info(target/precision/layout/device)
           // using kernel info
@@ -158,6 +159,55 @@ class Optimizer {

  const lite::Scope* exec_scope() const { return exec_scope_; }

+  // Set shape(dims) infos of var descs to scope var.
+  //  developer can write pass using input / output tensor dims of op.
+  //
+  // Example: If you have node `Node* softmax_node`,
+  //   you can get dims of output tensor in passes:
+  //
+  //   auto* scope = softmax_node->AsStmt().op()->scope();
+  //   auto softmax_out_arg_name =
+  //             softmax_node->outlinks.front()->AsArg().name;
+  //   auto softmax_out_tensor =
+  //             scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
+  //   softmax_out_dims = softmax_out_tensor.dims();
+  void SetVarDescShapeToScopeVar() {
+    auto dims_to_str_func = [](std::vector<int64_t> shape) -> std::string {
+      std::string str_res;
+      for (size_t i = 0; i < shape.size(); ++i) {
+        str_res += std::to_string(shape[i]);
+        if (i != shape.size() - 1) {
+          str_res += "x";
+        }
+      }
+      return str_res;
+    };
+
+    auto* program_desc = program_->program_desc();
+    VLOG(5) << "program_desc->BlocksSize():" << program_desc->BlocksSize();
+    auto blocks_desc = program_desc->GetBlocks();
+    for (size_t bidx = 0; bidx < blocks_desc.size(); ++bidx) {
+      auto block_desc = blocks_desc[bidx];
+      auto vars_desc = block_desc.GetVars();
+      for (size_t vidx = 0; vidx < vars_desc.size(); ++vidx) {
+        auto var_desc = vars_desc[vidx];
+        VLOG(5) << var_desc.Name() << " "
+                << dims_to_str_func(var_desc.GetShape());
+        if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue;
+        auto* var = program_->exec_scope()->FindVar(var_desc.Name());
+        auto tensor = var->GetMutable<lite::Tensor>();
+        if (tensor->dims().size() == 0 && var_desc.GetShape().size() != 0) {
+          VLOG(5) << "var_desc.Name():" << var_desc.Name()
+                  << " shape:" << dims_to_str_func(var_desc.GetShape());
+          tensor->Resize(var_desc.GetShape());
+        }
+        VLOG(5) << "var_desc.Name():" << var_desc.Name()
+                << " shape:" << dims_to_str_func(var_desc.GetShape())
+                << " tensor:" << tensor->dims();
+      }
+    }
+  }
+
  // Generate a new program based on the mir graph.
  std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
    auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
@@ -198,6 +248,7 @@ class Optimizer {

  // Specify the passes and run them.
  void RunPasses(const std::vector<std::string>& passes) {
+    SetVarDescShapeToScopeVar();
    for (auto& x : passes) {
      LOG(INFO) << "== Running pass: " << x;
      mir::Pass* pass = mir::PassManager::Global().LookUp(x);

--- a/lite/core/profile/profiler.cc
+++ b/lite/core/profile/profiler.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/core/profile/profiler.h"
+#include <iomanip>
 #include <map>
 #include <string>
 #include <utility>
@@ -23,10 +24,11 @@ namespace profile {

 namespace {
 auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
-  return (c1.target < c2.target) || (c1.op_type < c2.op_type) ||
-         (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark);
+  // compare for unique key of map
+  return (c1.kernel_name + c1.kernel_func_name <
+          c2.kernel_name + c2.kernel_func_name);
 };
-}
+}  // namespace

 std::map<Type, std::string> TypeStr{
    {Type::kUnk, "Unknown"},
@@ -64,22 +66,62 @@ int Profiler::NewTimer(const OpCharacter& ch) {
  return units_.size() - 1;
 }

+OpCharacter* Profiler::GetOpCharacter(const size_t index) {
+  CHECK_LT(index, units_.size())
+      << "The timer index in the profiler is out of range.";
+  return &units_[index].Character();
+}
+
 void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
  CHECK_LT(index, units_.size())
      << "The timer index in the profiler is out of range.";
  units_[index].Timer(type)->Start(ctx);
 }

-float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
+void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
  CHECK_LT(index, units_.size())
      << "The timer index in the profiler is out of range.";
-  return units_[index].Timer(type)->Stop(ctx);
+  units_[index].Timer(type)->Stop(ctx);
+#ifdef LITE_WITH_OPENCL
+  units_[index].Timer(type)->CLStop(units_[index].character.op_type,
+                                    units_[index].character.io_duration,
+                                    units_[index].character.cl_event);
+#endif
+}
+
+int Profiler::GetKernelFuncCalledTimes(const std::string& op_type,
+                                       const std::string& kernel_attr,
+                                       const std::string& kernel_func_name) {
+  int count = 0;
+  for (size_t i = 0; i < units_.size(); ++i) {
+    if ((units_[i].character.kernel_func_name == kernel_func_name) &&
+        (units_[i].character.kernel_attr == kernel_attr) &&
+        (units_[i].character.op_type == op_type)) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+float Profiler::GetKernelFuncSummaryGOPs(const std::string& op_type,
+                                         const std::string& kernel_attr,
+                                         const std::string& kernel_func_name) {
+  float GOPs = 0;
+  for (size_t i = 0; i < units_.size(); ++i) {
+    if ((units_[i].character.kernel_func_name == kernel_func_name) &&
+        (units_[i].character.kernel_attr == kernel_attr) &&
+        (units_[i].character.op_type == op_type)) {
+      GOPs += units_[i].character.macs;
+    }
+  }
+  return GOPs * 1e-9f;
 }

 std::string Profiler::Summary(Type type, bool concise, size_t w) {
  using std::setw;
  using std::left;
  using std::fixed;
+  using std::setprecision;
  STL::stringstream ss;
  std::string title;
  // Title.
@@ -94,14 +136,41 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
       << " Profiler Summary: " << name_ << ", Exclude " << w
       << " warm-ups =====" << std::endl;
  }
-  ss << setw(25) << left << "Operator Type"
-     << " " << setw(40) << left << "Kernel Name"
-     << " " << setw(12) << left << "Remark"
-     << " " << setw(12) << left << "Avg (ms)"
-     << " " << setw(12) << left << "Min (ms)"
-     << " " << setw(12) << left << "Max (ms)"
-     << " " << setw(12) << left << "Last (ms)"
-     << " " << setw(12) << left << "Percent (%)" << std::endl;
+  ss << setw(20) << left << "OperatorType"
+     << " " << setw(30) << left << "KerneAttr(Place)"
+     << " " << setw(24) << left << "KernelFuncName";
+  if (!concise) {
+    ss << " " << setw(26) << left << "Remark"
+       << " " << setw(15) << left << "InDim"
+       << " " << setw(15) << left << "FilterDim"
+       << " " << setw(15) << left << "OutDim";
+  }
+  ss << " " << setw(7) << left << "Avg(ms)"
+     << " " << setw(7) << left << "Min(ms)"
+     << " " << setw(7) << left << "Max(ms)";
+  if (!concise) {
+    ss << " " << setw(7) << left << "Last(ms)";
+  }
+  ss << " " << setw(7) << left << "Avg(%)"
+     << " " << setw(7) << left << "GOPs";
+  if (!concise) {
+    ss << " " << setw(7) << left << "GOPS";
+  }
+  if (concise) {
+    ss << " " << setw(11) << left << "CalledTimes";
+  }
+#ifdef LITE_WITH_OPENCL
+  ss << " " << setw(9) << left << "clAvg(ms)"
+     << " " << setw(9) << left << "clMin(ms)"
+     << " " << setw(9) << left << "clMax(ms)"
+     << " " << setw(9) << left << "clAvg(%)";
+  if (!concise) {
+    ss << " " << setw(12) << left << "GlobalWorkSize"
+       << " " << setw(12) << left << "LocalWorkSize";
+  }
+#endif
+  ss << std::endl;
+
  // Profile information.
  if (concise) {
    std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
@@ -111,32 +180,75 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
        ch->second.avg += unit.Timer(type)->LapTimes().Avg(w);
        ch->second.min += unit.Timer(type)->LapTimes().Min(w);
        ch->second.max += unit.Timer(type)->LapTimes().Max(w);
+#ifdef LITE_WITH_OPENCL
+        ch->second.cl_avg += unit.Timer(type)->CLLapTimes().Avg(w);
+        ch->second.cl_min += unit.Timer(type)->CLLapTimes().Min(w);
+        ch->second.cl_max += unit.Timer(type)->CLLapTimes().Max(w);
+#endif
      } else {
-        TimeInfo info({unit.Timer(type)->LapTimes().Avg(w),
-                       unit.Timer(type)->LapTimes().Min(w),
-                       unit.Timer(type)->LapTimes().Max(w)});
+        TimeInfo info;
+        info.avg = unit.Timer(type)->LapTimes().Avg(w);
+        info.min = unit.Timer(type)->LapTimes().Min(w);
+        info.max = unit.Timer(type)->LapTimes().Max(w);
+#ifdef LITE_WITH_OPENCL
+        info.cl_avg = unit.Timer(type)->CLLapTimes().Avg(w);
+        info.cl_min = unit.Timer(type)->CLLapTimes().Min(w);
+        info.cl_max = unit.Timer(type)->CLLapTimes().Max(w);
+#endif
        summary.insert({unit.Character(), info});
      }
    }
+
    // compute total time
    float total = 0.0;
    for (const auto& item : summary) {
      total += item.second.avg;
    }
+#ifdef LITE_WITH_OPENCL
+    float cl_total = 0.0;
+    for (const auto& item : summary) {
+      cl_total += item.second.cl_avg;
+    }
+#endif
+
    for (const auto& item : summary) {
      float percent = 0;
      if (total > 0) {
        percent = 100 * (item.second.avg / total);
      }
      // clang-format off
-      ss << setw(25) << left << fixed << item.first.op_type             \
-         << " " << setw(40) << left << fixed << item.first.kernel_name  \
-         << " " << setw(12) << left << fixed << item.first.remark       \
-         << " " << setw(12) << left << fixed << item.second.avg         \
-         << " " << setw(12) << left << fixed << item.second.min         \
-         << " " << setw(12) << left << fixed << item.second.max         \
-         << " " << setw(12) << left << fixed << percent << "%"          \
-         << " " << std::endl;
+      ss << setw(20) << left << fixed << item.first.op_type
+         << " " << setw(30) << left << fixed << item.first.kernel_attr
+         << " " << setw(24) << left << fixed << item.first.kernel_func_name
+         << " " << setw(7) << left << fixed << setprecision(3)
+         << item.second.avg
+         << " " << setw(7) << left << fixed << setprecision(3)
+         << item.second.min
+         << " " << setw(7) << left << fixed << setprecision(3)
+         << item.second.max
+         << " " << setprecision(2) << percent << "%   "
+         << " " << setw(7) << left << fixed << setprecision(3)
+         << GetKernelFuncSummaryGOPs(item.first.op_type,
+                                     item.first.kernel_attr,
+                                     item.first.kernel_func_name)
+         << " " << setw(11) << left << fixed
+         << GetKernelFuncCalledTimes(item.first.op_type,
+                                     item.first.kernel_attr,
+                                     item.first.kernel_func_name);
+#ifdef LITE_WITH_OPENCL
+      float cl_percent = 0;
+      if (cl_total > 0) {
+        cl_percent = 100 * (item.second.cl_avg / cl_total);
+      }
+      ss << " " << setw(9) << left << fixed << setprecision(3)
+         << item.second.cl_avg
+         << " " << setw(9) << left << fixed << setprecision(3)
+         << item.second.cl_min
+         << " " << setw(9) << left << fixed << setprecision(3)
+         << item.second.cl_max
+         << " " << left << fixed << setprecision(2) << cl_percent << "%   ";
+#endif
+      ss << std::endl;
      // clang-format on
    }
  } else {
@@ -145,6 +257,13 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
      const auto& times = unit.Timer(type)->LapTimes();
      total += times.Avg(w);
    }
+#ifdef LITE_WITH_OPENCL
+    float cl_total = 0.0;
+    for (auto& unit : units_) {
+      const auto& cl_times = unit.Timer(type)->CLLapTimes();
+      cl_total += cl_times.Avg(w);
+    }
+#endif
    for (auto& unit : units_) {
      const auto& times = unit.Timer(type)->LapTimes();
      float run = times.Avg(w);
@@ -152,17 +271,46 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
      if (total > 0) {
        percent = 100 * (run / total);
      }
+
+#ifdef LITE_WITH_OPENCL
+      const auto& cl_times = unit.Timer(type)->CLLapTimes();
+      float cl_run = cl_times.Avg(w);
+      float cl_percent = 0;
+      if (cl_total > 0) {
+        cl_percent = 100 * (cl_run / cl_total);
+      }
+#endif
+
      // clang-format off
-      ss << setw(25) << left << fixed << unit.Character().op_type            \
-         << " " << setw(40) << left << fixed << unit.Character().kernel_name \
-         << " " << setw(12) << left << fixed << unit.Character().remark      \
-         << " " << setw(12) << left << fixed << times.Avg(w)                 \
-         << " " << setw(12) << left << fixed << times.Min(w)                 \
-         << " " << setw(12) << left << fixed << times.Max(w)                 \
-         << " " << setw(12) << left << fixed << times.Last(w)                \
-          << " " << setw(12) << left << fixed << percent << "%"              \
-         << std::endl;
-      // clang-format on
+      ss << setw(20) << left << fixed << unit.Character().op_type
+         << " " << setw(30) << left << fixed << unit.Character().kernel_attr
+         << " " << setw(24) << left << fixed
+         << unit.Character().kernel_func_name
+         << " " << setw(26) << left << fixed << unit.Character().remark
+         << " " << setw(15) << left << fixed << unit.Character().input_shape
+         << " " << setw(15) << left << fixed << unit.Character().filter_shape
+         << " " << setw(15) << left << fixed << unit.Character().output_shape
+         << " " << setw(7) << left << fixed << setprecision(3) << times.Avg(w)
+         << " " << setw(7) << left << fixed << setprecision(3) << times.Min(w)
+         << " " << setw(7) << left << fixed << setprecision(3) << times.Max(w)
+         << " " << setw(7) << left << fixed << setprecision(3) << times.Last(w)
+         << " " << left << setprecision(2) << percent << "%   "
+         << " " << setw(7) << left << fixed << setprecision(3)
+                << 1e-9f * unit.Character().macs
+         << " " << setw(7) << left << fixed << setprecision(2)
+                << 1e-6f * unit.Character().macs / times.Avg(w);
+// clang-format on
+#ifdef LITE_WITH_OPENCL
+      ss << " " << setw(9) << left << fixed << setprecision(3)
+         << cl_times.Avg(w) << " " << setw(9) << left << fixed
+         << setprecision(3) << cl_times.Min(w) << " " << setw(9) << left
+         << fixed << setprecision(3) << cl_times.Max(w) << " " << left
+         << setprecision(2) << cl_percent << "%   "
+         << " " << setw(12) << left << fixed
+         << unit.Character().global_work_size << " " << setw(12) << left
+         << fixed << unit.Character().local_work_size;
+#endif
+      ss << std::endl;
    }
  }
  return ss.str();

--- a/lite/core/profile/profiler.h
+++ b/lite/core/profile/profiler.h
@@ -18,6 +18,10 @@
 #include <string>
 #include <vector>
 #include "lite/core/profile/timer.h"
+#include "lite/core/tensor.h"
+#ifdef LITE_WITH_OPENCL
+#include "lite/backends/opencl/cl_include.h"
+#endif

 namespace paddle {
 namespace lite {
@@ -35,25 +39,83 @@ struct TimeInfo {
  float avg;
  float min;
  float max;
+#ifdef LITE_WITH_OPENCL
+  float cl_avg;
+  float cl_min;
+  float cl_max;
+#endif
 };

 struct OpCharacter {
  TargetType target;
+  void* op_lite{nullptr};
  std::string op_type{std::string("N/A")};
  std::string kernel_name{std::string("N/A")};
+  std::string kernel_attr{std::string("N/A")};
+  std::string kernel_func_name{std::string("N/A")};
  std::string remark{std::string("N/A")};
+
+  std::string input_shape{"N/A"};
+  std::string output_shape{"N/A"};
+  std::string filter_shape{"N/A"};
+
+  float macs{0};
+  float macs_ps{0};
+
+  float io_duration{0};
+
+#ifdef LITE_WITH_OPENCL
+  cl::Event cl_event{};
+  std::string global_work_size{"N/A"};
+  std::string local_work_size{"N/A"};
+
+  std::string NDRangeToStr(const cl::NDRange& range) {
+    std::string range_str{""};
+    const size_t range_size = 3;
+    for (size_t i = 0; i < range_size /*range.size()*/; ++i) {
+      LOG(INFO) << "range[" << i << "]:" << std::to_string(range[i]);
+      range_str += std::to_string(range[i]);
+      if (i != range_size - 1) {
+        range_str += ",";
+      }
+    }
+    return range_str;
+  }
+#else
+  void* cl_event{nullptr};
+#endif
+
+  std::string DimToStr(const paddle::lite::DDimLite& dim) {
+    if (!dim.size()) return "NotImpl";
+    std::string dim_str{""};
+    for (size_t i = 0; i < dim.size(); ++i) {
+      dim_str += std::to_string(dim[i]);
+      if (i != dim.size() - 1) {
+        dim_str += "x";
+      }
+    }
+    return dim_str;
+  }
+
+  std::string str() {
+    std::string str{""};
+    str += kernel_name + "/" + kernel_func_name + "/" + remark + "/" +
+           input_shape + "/" + filter_shape + "/" + output_shape;
+    return str;
+  }
 };

 class StatisUnit final {
 public:
  explicit StatisUnit(const OpCharacter& ch);
  lite::profile::Timer* Timer(Type type);
-  const OpCharacter& Character() const { return character; }
+  OpCharacter& Character() { return character; }
+
+  OpCharacter character;

 protected:
  std::unique_ptr<lite::profile::Timer> create_t;
  std::unique_ptr<lite::profile::Timer> dispatch_t;
-  OpCharacter character;
 };

 class Profiler final {
@@ -62,8 +124,15 @@ class Profiler final {
  explicit Profiler(const std::string& name) : name_(name) {}
  int NewTimer(const OpCharacter& ch);
  void StartTiming(Type type, const int index, KernelContext* ctx);
-  float StopTiming(Type type, const int index, KernelContext* ctx);
+  void StopTiming(Type type, const int index, KernelContext* ctx);
  std::string Summary(Type type, bool concise = true, size_t warm_up = 10);
+  int GetKernelFuncCalledTimes(const std::string& op_type,
+                               const std::string& kernel_attr,
+                               const std::string& kernel_func_name);
+  float GetKernelFuncSummaryGOPs(const std::string& op_type,
+                                 const std::string& kernel_attr,
+                                 const std::string& kernel_func_name);
+  OpCharacter* GetOpCharacter(const size_t index);

 private:
  std::string name_{std::string("N/A")};

--- a/lite/core/profile/timer.h
+++ b/lite/core/profile/timer.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <algorithm>
 #include <chrono>  // NOLINT
+#include <string>
 #include <vector>
 #ifdef LITE_WITH_CUDA
 #include "lite/backends/cuda/cuda_utils.h"
@@ -87,6 +88,22 @@ class Timer {
    this->laps_t_.Add(elapse_ms);
    return elapse_ms;
  }
+
+#ifdef LITE_WITH_OPENCL
+  float CLStop(const std::string& op_type, float io_duration, cl::Event event) {
+    float cl_kernel_elapse_ms = 0.0;
+    if (op_type != "io_copy") {
+      cl_kernel_elapse_ms =
+          CLRuntime::Global()->CLRuntime::GetCommandTime(event);
+    } else {
+      cl_kernel_elapse_ms = io_duration;
+    }
+    this->cl_laps_t_.Add(cl_kernel_elapse_ms);
+    return cl_kernel_elapse_ms;
+  }
+  const TimeList<float>& CLLapTimes() const { return cl_laps_t_; }
+#endif
+
  virtual void Start(KernelContext* ctx) { return Start(); }
  virtual float Stop(KernelContext* ctx) { return Stop(); }
  float AvgLapTimeMs() const { return laps_t_.Avg(); }
@@ -94,6 +111,9 @@ class Timer {

 protected:
  TimeList<float> laps_t_;
+#ifdef LITE_WITH_OPENCL
+  TimeList<float> cl_laps_t_;
+#endif

 private:
  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;

--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -169,7 +169,7 @@ void RuntimeProgram::Run() {
 #endif  // LITE_WITH_PRECISION_PROFILE
  }
 #ifdef LITE_WITH_PROFILE
-  LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
+  LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
 #endif
 #ifdef LITE_WITH_PRECISION_PROFILE
  LOG(INFO) << "\n" << precision_profiler_summary;
@@ -299,6 +299,14 @@ void Instruction::Run() {
  op_->InferShape();
  kernel_->Launch();
  has_run_ = true;
+
+#ifdef LITE_WITH_PROFILE
+  if (first_epoch_for_profiler_) {
+    kernel_->SetIsKernelTest(false);
+    SetProfileRuntimeOpInfo(profiler_->GetOpCharacter(profile_id_));
+    first_epoch_for_profiler_ = false;
+  }
+#endif
 }

 STL::ostream& operator<<(STL::ostream& os, const Instruction& other) {

--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -23,6 +23,9 @@
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/model_parser/cpp/program_desc.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif

 namespace paddle {
 namespace lite {
@@ -64,6 +67,8 @@ struct Program {
  lite::Scope* exec_scope() { return exec_scope_; }
  lite::Scope* scope() { return scope_.get(); }

+  cpp::ProgramDesc* program_desc() { return &desc_; }
+
  const std::unordered_map<std::string, PrecisionType>& var_data_type() const {
    return var_data_type_;
  }
@@ -125,13 +130,22 @@ struct Instruction {
    profiler_ = profiler;
    if (op_->Type() != "feed" && op_->Type() != "fetch") {
      profile::OpCharacter ch;
+      ch.op_lite = static_cast<void*>(const_cast<paddle::lite::OpLite*>(op()));
      ch.target = kernel()->target();
      ch.op_type = op_->Type();
      ch.kernel_name = kernel()->name();
+      ch.kernel_attr = kernel()->name().substr(ch.op_type.size() + 1,
+                                               kernel()->name().size());
+      // append `ch.kernel_func_name` in StopTiming
      profile_id_ = profiler->NewTimer(ch);
      kernel_->SetProfiler(profiler_, profile_id_);
    }
  }
+
+  void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) {
+    auto* op_lite = static_cast<paddle::lite::OpLite*>(ch->op_lite);
+    op_lite->GetOpRuntimeInfo(ch);
+  }
 #endif

 private:
@@ -144,6 +158,7 @@ struct Instruction {
 #ifdef LITE_WITH_PROFILE
  profile::Profiler* profiler_;
  int profile_id_{-1};
+  bool first_epoch_for_profiler_{true};
 #endif  // LITE_WITH_PROFILE
 };


--- a/lite/kernels/arm/argmax_compute.cc
+++ b/lite/kernels/arm/argmax_compute.cc
@@ -35,6 +35,9 @@ void ArgmaxCompute::Run() {
  }

  lite::arm::math::argmax_func(input, axis, output);
+#ifdef LITE_WITH_PROFILE
+  kernel_func_name_ = "argmax_func";
+#endif
  return;
 }


--- a/lite/kernels/arm/argmax_compute.h
+++ b/lite/kernels/arm/argmax_compute.h
@@ -16,6 +16,10 @@
 #include <algorithm>
 #include "lite/core/kernel.h"
 #include "lite/operators/argmax_op.h"
+#ifdef LITE_WITH_PROFILE
+#include <string>
+#include "lite/core/profile/profiler.h"
+#endif

 namespace paddle {
 namespace lite {
@@ -29,6 +33,14 @@ class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
  void Run() override;

  virtual ~ArgmaxCompute() = default;
+
+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+  }
+  std::string kernel_func_name_{"NotImplForArgmax"};
+#endif
 };

 }  // namespace arm

--- a/lite/kernels/arm/calib_compute.cc
+++ b/lite/kernels/arm/calib_compute.cc
@@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8<DLType>::Run() {
      din, dout, scale.data(), 1, 1, param.input->numel());
 }

+template <DataLayoutType DLType>
+void CalibComputeInt64ToInt32<DLType>::Run() {
+  auto& param = this->template Param<operators::CalibParam>();
+  const auto* din = param.input->template data<int64_t>();
+  std::vector<float> scale = {param.scale};
+  auto* dout = param.output->template mutable_data<int32_t>();
+  for (auto i = 0; i < param.input->numel(); ++i) {
+    dout[i] = din[i];
+  }
+}
+
 template <DataLayoutType DLType>
 void CalibComputeInt8ToFp32<DLType>::Run() {
  auto& param = this->template Param<operators::CalibParam>();
@@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL(
                                       DATALAYOUT(kNHWC))})
    .Finalize();

+REGISTER_LITE_KERNEL(
+    calib,
+    kARM,
+    kInt64,
+    kNCHW,
+    paddle::lite::kernels::arm::CalibComputeInt64ToInt32<DATALAYOUT(kNCHW)>,
+    int64_to_int32)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kARM),
+                                      PRECISION(kInt64),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kARM),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(
    calib_once,
    kARM,
@@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL(
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    calib_once,
+    kARM,
+    kInt64,
+    kNCHW,
+    paddle::lite::kernels::arm::CalibComputeInt64ToInt32<DATALAYOUT(kNCHW)>,
+    int64_to_int32)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kARM),
+                                      PRECISION(kInt64),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kARM),
+                                       PRECISION(kInt32),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
--- a/lite/kernels/arm/calib_compute.h
+++ b/lite/kernels/arm/calib_compute.h
@@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8
 private:
 };

+template <DataLayoutType DLType>
+class CalibComputeInt64ToInt32
+    : public KernelLite<TARGET(kARM), PRECISION(kInt64), DLType> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeInt64ToInt32() override{};
+
+ private:
+};
+
 template <DataLayoutType DLType>
 class CalibComputeInt8ToFp32
    : public KernelLite<TARGET(kARM), PRECISION(kInt8), DLType> {

--- a/lite/kernels/arm/cast_compute.cc
+++ b/lite/kernels/arm/cast_compute.cc
@@ -62,8 +62,19 @@ void CastCompute::Run() {
    int32_t* out_data = param.Out->mutable_data<int32_t>();
    std::transform(
        x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
+  } else if (param.in_dtype == 0 && param.out_dtype == 5) {  // bool->fp32
+    const bool* x_data_begin = param.X->data<bool>();
+    const bool* x_data_end = x_data_begin + param.X->numel();
+    float* out_data = param.Out->mutable_data<float>();
+    std::transform(x_data_begin, x_data_end, out_data, TransOp<bool, float>);
+  } else if (param.in_dtype == 3 && param.out_dtype == 5) {  // int64->fp32
+    const int64_t* x_data_begin = param.X->data<int64_t>();
+    const int64_t* x_data_end = x_data_begin + param.X->numel();
+    float* out_data = param.Out->mutable_data<float>();
+    std::transform(x_data_begin, x_data_end, out_data, TransOp<int64_t, float>);
  } else {
-    LOG(FATAL) << "other has not been implemented";
+    LOG(FATAL) << "other has not been implemented transform with dtype"
+               << param.in_dtype << " X, dtype" << param.out_dtype << " Out";
  }
 }


--- a/lite/kernels/arm/conv_compute.h
+++ b/lite/kernels/arm/conv_compute.h
@@ -15,6 +15,9 @@
 #pragma once
 #include "lite/backends/arm/math/funcs.h"
 #include "lite/core/kernel.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif

 namespace paddle {
 namespace lite {
@@ -36,6 +39,13 @@ class ConvCompute : public KernelLite<TARGET(kARM), Ptype> {
    impl_->Run();
  }

+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    impl_->SetProfileRuntimeKernelInfo(ch);
+  }
+#endif
+
  ~ConvCompute() {
    if (impl_ != nullptr) {
      delete impl_;

--- a/lite/kernels/arm/conv_depthwise.cc
+++ b/lite/kernels/arm/conv_depthwise.cc
@@ -50,6 +50,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
      flag_trans_weights_ = true;
    }
    impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_depthwise_3x3_fp32";
+#endif
  } else if (kw == 5) {
    // VLOG(5) << "invoke 5x5 dw conv fp32";
    auto strides = param.strides;
@@ -67,6 +70,9 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
          w_data_in, w_data, oc, 1, cblock, kh * kw);
      flag_trans_weights_ = true;
      impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
+#ifdef LITE_WITH_PROFILE
+      kernel_func_name_ = "conv_depthwise_5x5_fp32";
+#endif
    } else {
      LOG(FATAL)
          << "5x5 depthwise conv only support stride == 1 or stride == 2";
@@ -103,6 +109,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
    // trans weights
    // VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
    impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_depthwise_3x3_int8_fp32";
+#endif
    int cround = ROUNDUP(w_dims[0], 8);
    weights_.Resize({cround / 8, 1, kh * kw, 8});
    auto wptr = param.filter->data<int8_t>();
@@ -113,6 +122,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
    // trans weights
    // VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
    impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_depthwise_5x5_int8_fp32";
+#endif
    int cround = ROUNDUP(w_dims[0], 8);
    weights_.Resize({cround / 8, 1, kh * kw, 8});
    auto wptr = param.filter->data<int8_t>();
@@ -162,6 +174,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
    // trans weights
    // VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
    impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_depthwise_3x3_int8_int8";
+#endif
    int cround = ROUNDUP(w_dims[0], 8);
    weights_.Resize({cround / 8, 1, kh * kw, 8});
    auto wptr = param.filter->data<int8_t>();
@@ -172,6 +187,9 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
    // trans weights
    // VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
    impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_depthwise_5x5_int8_int8";
+#endif
    int cround = ROUNDUP(w_dims[0], 8);
    weights_.Resize({cround / 8, 1, kh * kw, 8});
    auto wptr = param.filter->data<int8_t>();
@@ -183,6 +201,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
  }
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -225,6 +251,14 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
        w_scale_.data());
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -267,6 +301,14 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
        w_scale_.data());
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
  auto& param = this->Param<param_t>();

--- a/lite/kernels/arm/conv_depthwise.h
+++ b/lite/kernels/arm/conv_depthwise.h
@@ -15,6 +15,7 @@
 #pragma once

 #include <cmath>
+#include <string>
 #include <vector>
 #include "lite/backends/arm/math/conv_impl.h"
 #include "lite/core/context.h"
@@ -48,6 +49,15 @@ class DepthwiseConv : public KernelLite<TARGET(kARM), Ptype> {
  virtual void PrepareForRun();
  virtual void Run();

+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+  }
+
+  std::string kernel_func_name_{"NotImplForConvDw"};
+#endif
+
 private:
  using param_t = operators::ConvParam;
  Tensor weights_;

--- a/lite/kernels/arm/conv_direct.cc
+++ b/lite/kernels/arm/conv_direct.cc
@@ -19,6 +19,14 @@ namespace lite {
 namespace kernels {
 namespace arm {

+#ifdef LITE_WITH_PROFILE
+template <>
+void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -62,6 +70,9 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                                            b_data,
                                            param,
                                            &ctx);
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_3x3s1_direct_fp32";
+#endif
  } else {
    lite::arm::math::conv_3x3s2_direct_fp32(i_data,
                                            o_data,
@@ -76,9 +87,20 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                                            b_data,
                                            param,
                                            &ctx);
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_3x3s2_direct_fp32";
+#endif
  }
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -117,6 +139,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
                                            param,
                                            &ctx,
                                            w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_3x3s1_direct_int8";
+#endif
  } else {
    lite::arm::math::conv_3x3s2_direct_int8(i_data,
                                            o_data,
@@ -132,9 +157,20 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
                                            param,
                                            &ctx,
                                            w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_3x3s2_direct_int8";
+#endif
  }
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
  auto& param = this->Param<param_t>();
@@ -173,6 +209,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
                                            param,
                                            &ctx,
                                            w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_3x3s1_direct_int8";
+#endif
  } else {
    lite::arm::math::conv_3x3s2_direct_int8(i_data,
                                            o_data,
@@ -188,6 +227,9 @@ void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
                                            param,
                                            &ctx,
                                            w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_3x3s2_direct_int8";
+#endif
  }
 }


--- a/lite/kernels/arm/conv_direct.h
+++ b/lite/kernels/arm/conv_direct.h
@@ -15,6 +15,7 @@
 #pragma once

 #include <cmath>
+#include <string>
 #include <vector>
 #include "lite/backends/arm/math/funcs.h"
 #include "lite/core/context.h"
@@ -180,6 +181,15 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {

  virtual void Run();

+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+  }
+
+  std::string kernel_func_name_{"NotImplForConvDirect"};
+#endif
+
  /// todo, support inplace weights transform
 protected:
  Tensor weights_;

--- a/lite/kernels/arm/conv_gemmlike.cc
+++ b/lite/kernels/arm/conv_gemmlike.cc
@@ -81,6 +81,14 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
  }
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -111,12 +119,26 @@ void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
  if (flag_1x1gemm_) {
    lite::arm::math::conv1x1s1_gemm(
        din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx);
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv1x1s1_gemm";
+#endif
  } else {
    lite::arm::math::conv_im2col_gemm(
        din, dout, bs, oc, oh, ow, ic, ih, iw, weights, bias, param, &ctx);
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_im2col_gemm";
+#endif
  }
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -159,6 +181,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
                                         param,
                                         &ctx,
                                         w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv1x1s1_gemm_int8";
+#endif
  } else {
    lite::arm::math::conv_im2col_gemm_int8(din,
                                           dout,
@@ -174,9 +199,20 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
                                           param,
                                           &ctx,
                                           w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_im2col_gemm_int8";
+#endif
  }
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
  auto& param = this->Param<param_t>();
@@ -219,6 +255,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
                                         param,
                                         &ctx,
                                         w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv1x1s1_gemm_int8";
+#endif
  } else {
    lite::arm::math::conv_im2col_gemm_int8(din,
                                           dout,
@@ -234,6 +273,9 @@ void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
                                           param,
                                           &ctx,
                                           w_scale_.data());
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_im2col_gemm_int8";
+#endif
  }
 }


--- a/lite/kernels/arm/conv_gemmlike.h
+++ b/lite/kernels/arm/conv_gemmlike.h
@@ -15,6 +15,7 @@
 #pragma once

 #include <cmath>
+#include <string>
 #include <vector>
 #include "lite/backends/arm/math/conv_impl.h"
 #include "lite/backends/arm/math/funcs.h"
@@ -94,6 +95,15 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
  virtual void PrepareForRun();
  virtual void Run();

+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+  }
+
+  std::string kernel_func_name_{"NotImplForConvGemm"};
+#endif
+
  /// todo, support inplace weights transform
 protected:
  using param_t = operators::ConvParam;

--- a/lite/kernels/arm/conv_transpose_compute.h
+++ b/lite/kernels/arm/conv_transpose_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <string>
 #include "lite/backends/arm/math/funcs.h"
 #include "lite/core/kernel.h"
 #include "lite/operators/conv_transpose_op.h"
@@ -33,6 +34,14 @@ class Conv2DTransposeCompute

  ~Conv2DTransposeCompute() = default;

+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+  }
+  std::string kernel_func_name_{"NotImplForConvTranspose"};
+#endif
+
 protected:
  int workspace_size_{0};
 };

--- a/lite/kernels/arm/conv_winograd.cc
+++ b/lite/kernels/arm/conv_winograd.cc
@@ -93,6 +93,14 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
  ReInitWhenNeeded();
 }

+#ifdef LITE_WITH_PROFILE
+template <>
+void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::
+    SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+  ch->kernel_func_name = kernel_func_name_;
+}
+#endif
+
 template <>
 void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
  auto& param = this->Param<param_t>();
@@ -129,6 +137,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                                          b_data,
                                          param,
                                          &ctx);
+#ifdef LITE_WITH_PROFILE
+    kernel_func_name_ = "conv_compute_6x6_3x3";
+#endif
  } else {
    int tile_block = 8;
    int block_count =
@@ -147,6 +158,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                                            b_data,
                                            param,
                                            &ctx);
+#ifdef LITE_WITH_PROFILE
+      kernel_func_name_ = "conv_compute_2x2_3x3";
+#endif
    } else {
      lite::arm::math::conv_compute_2x2_3x3_small(i_data,
                                                  o_data,
@@ -161,6 +175,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                                                  b_data,
                                                  param,
                                                  &ctx);
+#ifdef LITE_WITH_PROFILE
+      kernel_func_name_ = "conv_compute_2x2_3x3_small";
+#endif
    }
  }
 }

--- a/lite/kernels/arm/conv_winograd.h
+++ b/lite/kernels/arm/conv_winograd.h
@@ -35,6 +35,13 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> {
  virtual void PrepareForRun();
  virtual void ReInitWhenNeeded();
  virtual void Run();
+#ifdef LITE_WITH_PROFILE
+  virtual void SetProfileRuntimeKernelInfo(
+      paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+  }
+  std::string kernel_func_name_{"NotImplForConvWino"};
+#endif

 protected:
  using param_t = operators::ConvParam;

--- a/lite/kernels/arm/elementwise_compute.cc
+++ b/lite/kernels/arm/elementwise_compute.cc
@@ -300,11 +300,12 @@ void ElementwiseMaxActivationCompute::Run() {
  }
 }

-void ElementwiseDivCompute::Run() {
-  auto& param = Param<operators::ElementwiseParam>();
-  const float* x_data = param.X->data<float>();
-  const float* y_data = param.Y->data<float>();
-  float* out_data = param.Out->mutable_data<float>();
+template <typename T, PrecisionType PType>
+void ElementwiseDivCompute<T, PType>::Run() {
+  auto& param = this->template Param<operators::ElementwiseParam>();
+  auto* x_data = param.X->template data<T>();
+  auto* y_data = param.Y->template data<T>();
+  auto* out_data = param.Out->template mutable_data<T>();
  int axis = param.axis;
  auto x_dims = param.X->dims();
  auto y_dims = param.Y->dims();
@@ -313,10 +314,10 @@ void ElementwiseDivCompute::Run() {
    LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
  }
  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
-    lite::arm::math::elementwise_div_broadcast(
+    lite::arm::math::elementwise_div_broadcast<T>(
        x_data, y_data, out_data, pre, n, post);
  } else {
-    lite::arm::math::elementwise_div(
+    lite::arm::math::elementwise_div<T>(
        x_data, y_data, out_data, x_dims.production());
  }
 }
@@ -465,17 +466,27 @@ REGISTER_LITE_KERNEL(
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();

-REGISTER_LITE_KERNEL(elementwise_div,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::ElementwiseDivCompute,
-                     def)
+using elementwise_div_fp32 =
+    paddle::lite::kernels::arm::ElementwiseDivCompute<float, PRECISION(kFloat)>;
+
+REGISTER_LITE_KERNEL(
+    elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();

+using elementwise_div_int64 =
+    paddle::lite::kernels::arm::ElementwiseDivCompute<int64_t,
+                                                      PRECISION(kInt64)>;
+
+REGISTER_LITE_KERNEL(
+    elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(
    fusion_elementwise_div_activation,
    kARM,

--- a/lite/kernels/arm/elementwise_compute.h
+++ b/lite/kernels/arm/elementwise_compute.h
@@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute
  virtual ~ElementwiseMaxActivationCompute() = default;
 };

-class ElementwiseDivCompute
-    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+template <typename T, PrecisionType PType>
+class ElementwiseDivCompute : public KernelLite<TARGET(kARM), PType> {
 public:
  void Run() override;


--- a/lite/kernels/arm/gather_compute.cc
+++ b/lite/kernels/arm/gather_compute.cc
@@ -73,7 +73,6 @@ void GatherCompute::Run() {
 REGISTER_LITE_KERNEL(
    gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
-    .BindInput("Index",
-               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+    .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
    .Finalize();
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -34,6 +34,9 @@ add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc
 add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
+
 # extra
 # wait to add ...

@@ -73,6 +76,12 @@ lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
 lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
             DEPS layout_opencl op_registry program context)

+lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
+             DEPS pixel_shuffle_opencl op_registry program context)
+
+lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
+             DEPS expand_opencl op_registry program context)
+
 lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
             DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
 lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc

--- a/lite/kernels/opencl/activation_buffer_compute.cc
+++ b/lite/kernels/opencl/activation_buffer_compute.cc
@@ -18,6 +18,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -63,16 +67,24 @@ class ReluCompute

    auto global_work_size = cl::NDRange{count};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
@@ -120,16 +132,24 @@ class SigmoidCompute

    auto global_work_size = cl::NDRange{count};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  std::string kernel_func_name_{"sigmoid"};
  std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};

--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
@@ -19,6 +19,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -148,16 +152,24 @@ class ActivationComputeImageDefault
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size_,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size_,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  param_t* act_param_{nullptr};
  DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(

--- a/lite/kernels/opencl/bilinear_interp_image_compute.cc
+++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -142,13 +146,13 @@ class BilinearInterpImageCompute
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
 #ifdef LITE_WITH_LOG
    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "

--- a/lite/kernels/opencl/box_coder_image_compute.cc
+++ b/lite/kernels/opencl/box_coder_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -121,13 +125,13 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
          cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
                      static_cast<cl::size_type>(default_work_size[2])};

-      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-          kernel,
-          cl::NullRange,
-          global_work_size,
-          cl::NullRange,
-          nullptr,
-          nullptr);
+      status = EnqueueNDRangeKernel(context,
+                                    kernel,
+                                    cl::NullRange,
+                                    global_work_size,
+                                    cl::NullRange,
+                                    nullptr,
+                                    event_);
      CL_CHECK_FATAL(status);

 #ifdef LITE_WITH_LOG
@@ -138,6 +142,14 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
  }
  std::string doc() { return "Boxcoder using cl::Image, kFP16"; }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  param_t* boxcoder_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{" -DCL_DTYPE_half"};

--- a/lite/kernels/opencl/concat_buffer_compute.cc
+++ b/lite/kernels/opencl/concat_buffer_compute.cc
@@ -18,6 +18,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -124,13 +128,13 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
      status = kernel.setArg(++arg_idx, total1);
      CL_CHECK_FATAL(status);

-      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-          kernel,
-          cl::NullRange,
-          global_work_size,
-          cl::NullRange,
-          nullptr,
-          nullptr);
+      status = EnqueueNDRangeKernel(context,
+                                    kernel,
+                                    cl::NullRange,
+                                    global_work_size,
+                                    cl::NullRange,
+                                    nullptr,
+                                    event_);
      CL_CHECK_FATAL(status);
    } else {
      auto start = 0;
@@ -157,13 +161,13 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
        status = kernel.setArg(++arg_idx, total0);
        CL_CHECK_FATAL(status);

-        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-            kernel,
-            cl::NullRange,
-            global_work_size,
-            cl::NullRange,
-            nullptr,
-            nullptr);
+        status = EnqueueNDRangeKernel(context,
+                                      kernel,
+                                      cl::NullRange,
+                                      global_work_size,
+                                      cl::NullRange,
+                                      nullptr,
+                                      event_);
        CL_CHECK_FATAL(status);
        start += size;
      }
@@ -172,6 +176,14 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),

  std::string doc() { return "Concat using cl::Buffer, kFloat"; }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  int axis_size_ = 1;
  int post_size_ = 1;
  int pre_size_ = 1;

--- a/lite/kernels/opencl/concat_image_compute.cc
+++ b/lite/kernels/opencl/concat_image_compute.cc
@@ -19,6 +19,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -246,6 +250,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),

  std::string doc() { return "Concat using cl::Image, kFP16"; }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  int axis_size_ = 1;
  int axis_ = 1;
  int flag_ = 1;

--- a/lite/kernels/opencl/conv_buffer_compute.h
+++ b/lite/kernels/opencl/conv_buffer_compute.h
@@ -23,6 +23,10 @@
 #include "lite/core/tensor.h"
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -39,6 +43,14 @@ class ConvCompute

  void Run() override;

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_names_[0];
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  void GemmlikeConv2d();
  void Conv2d1x1();

--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
@@ -24,11 +24,16 @@
 #include "lite/core/tensor.h"
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
+
 class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
                                           PRECISION(kFP16),
                                           DATALAYOUT(kImageDefault)> {
@@ -38,20 +43,37 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),

  void PrepareForRun() override;

+  void ReInitWhenNeeded() override;
+
  void Run() override;
-  double Turn(int times = 5);
+
+  double Tune(int times = 5);
+
+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_names_[0];
+    ch->global_work_size = ch->NDRangeToStr(global_work_size_);
+    ch->local_work_size = ch->NDRangeToStr(local_work_size_);
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif

 private:
-  void Conv2d1x1opt(bool is_turn = false);
-  void Conv2d3x3(bool is_turn = false);
-  void Conv2d3x3opt(bool is_turn = false);
-  void Conv2d5x5(bool is_turn = false);
-  void Conv2d5x5opt(bool is_turn = false);
-  void Conv2d7x7(bool is_turn = false);
-  void Conv2d7x7opt(bool is_turn = false);
-  void DepthwiseConv2d3x3s1(bool is_turn = false);
-  void DepthwiseConv2d3x3(bool is_turn = false);
-  void DepthwiseConv2d(bool is_turn = false);
+  void PrintConvInfo();
+  void GetGlobalWorkSize();
+  void Conv2d1x1opt(bool enable_tune = false);
+  void Conv2d3x3(bool enable_tune = false);
+  void Conv2d3x3opt(bool enable_tune = false);
+  void Conv2d5x5(bool enable_tune = false);
+  void Conv2d5x5opt(bool enable_tune = false);
+  void Conv2d7x7(bool enable_tune = false);
+  void Conv2d7x7opt(bool enable_tune = false);
+  void DepthwiseConv2d3x3s1(bool enable_tune = false);
+  void DepthwiseConv2d3x3(bool enable_tune = false);
+  void DepthwiseConv2d(bool enable_tune = false);
+
+  param_t* conv_param_{nullptr};

  kernel_t impl_;
  std::vector<std::string> kernel_func_names_{};
@@ -65,19 +87,72 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
  std::unique_ptr<Tensor> tensor_hold_bias_image_{nullptr};
  cl::NDRange global_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
+
+  // opencl kernel args
  int c_blk_ = 1;
  int w_blk_ = 1;
  int nh_blk_ = 1;

+  const cl::Image2D* input_image_p_{nullptr};
+  const cl::Image2D* filter_image_p_{nullptr};
+  const cl::Image2D* bias_image_p_{nullptr};
+  const cl::Image2D* output_image_p_{nullptr};
+
+  int stride_h_{-1};
+  int stride_w_{-1};
+
+  int dilation_h_{-1};
+  int dilation_w_{-1};
+
+  int pad_up_{-1};
+  int pad_down_{-1};
+  int pad_left_{-1};
+  int pad_right_{-1};
+
+  int offset_{-1};
+  int groups_{-1};
+  bool relu_fused_{false};
+  bool has_bias_{false};
+
+  int input_tensor_n_{-1};
+  int input_tensor_c_{-1};
+  int input_tensor_h_{-1};
+  int input_tensor_w_{-1};
+  int input_image_h_{-1};
+  int input_image_w_{-1};
+  int input_c_block_{-1};
+
+  int output_tensor_n_{-1};
+  int output_tensor_c_{-1};
+  int output_tensor_h_{-1};
+  int output_tensor_w_{-1};
+  int output_image_h_{-1};
+  int output_image_w_{-1};
+
+  int filter_tensor_n_{-1};
+  int filter_tensor_c_{-1};
+  int filter_tensor_h_{-1};
+  int filter_tensor_w_{-1};
+  int filter_image_h_{-1};
+  int filter_image_w_{-1};
+
+  int bias_image_h_{-1};
+  int bias_image_w_{-1};
+
  int default_c_blk_ = 1;
  int default_w_blk_ = 1;
  int default_nh_blk_ = 1;
+  // =================
+
+  DDim last_input_dims_{};
+  bool is_first_epoch_for_run_{true};

  cl::Kernel kernel_;
+  cl_int status_;
  cl::NDRange local_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  bool use_lws_{true};
-  bool use_turn_{false};
+  bool use_tune_{false};
 };

 }  // namespace opencl

--- a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
@@ -20,6 +20,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -119,6 +123,14 @@ class DepthwiseConv2dCompute
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  std::string kernel_func_name_{"depthwise_conv2d"};
  std::string build_options_{"-DCL_DTYPE_float"};

--- a/lite/kernels/opencl/dropout_image_compute.cc
+++ b/lite/kernels/opencl/dropout_image_compute.cc
@@ -21,6 +21,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
 #include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -89,16 +93,24 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size.data()[1]),
                    static_cast<cl::size_type>(default_work_size.data()[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  std::string kernel_func_name_{"dropout"};
  std::string build_options_{"-DCL_DTYPE_half"};

--- a/lite/kernels/opencl/elementwise_add_buffer_compute.h
+++ b/lite/kernels/opencl/elementwise_add_buffer_compute.h
@@ -19,6 +19,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/cp_logging.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -38,6 +42,14 @@ class ElementwiseAddCompute
    return "ElementwiseAdd using cl::Buffer, kFloat";
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 protected:
  void UpdateParams();


--- a/lite/kernels/opencl/elementwise_add_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute.cc
@@ -18,6 +18,8 @@
 #include "lite/core/op_registry.h"
 #include "lite/utils/replace_stl/stream.h"

+#undef LITE_WITH_LOG
+
 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -154,13 +156,13 @@ void ElementwiseAddImageCompute::Run() {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);

-  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-      kernel,
-      cl::NullRange,
-      global_work_size_,
-      cl::NullRange,
-      nullptr,
-      nullptr);
+  status = EnqueueNDRangeKernel(context,
+                                kernel,
+                                cl::NullRange,
+                                global_work_size_,
+                                cl::NullRange,
+                                nullptr,
+                                event_);
  CL_CHECK_FATAL(status);
 }

@@ -196,3 +198,5 @@ REGISTER_LITE_KERNEL(elementwise_add,
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
+
+#define LITE_WITH_LOG
--- a/lite/kernels/opencl/elementwise_add_image_compute.h
+++ b/lite/kernels/opencl/elementwise_add_image_compute.h
@@ -21,6 +21,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/cp_logging.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -42,6 +46,14 @@ class ElementwiseAddImageCompute

  void Run() override;

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  std::string doc() const override {
    return "ElementwiseAdd using cl::Image2D, kFP16";
  }

--- a/lite/kernels/opencl/elementwise_mul_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_compute.cc
@@ -153,13 +153,13 @@ void ElementwiseMulFloatImageCompute::Run() {
  auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
                                      static_cast<cl::size_type>(x_img_height)};

-  auto  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel
-      kernel,
-      cl::NullRange,
-      global_work_size,
-      cl::NullRange,
-      nullptr,
-      nullptr);
+  auto status = EnqueueNDRangeKernel(context,
+                                     kernel,
+                                     cl::NullRange,
+                                     global_work_size,
+                                     cl::NullRange,
+                                     nullptr,
+                                     event_);
  CL_CHECK_FATAL(status);
  std::string time_stamp_{GetTimeStamp()};


--- a/lite/kernels/opencl/elementwise_mul_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -186,13 +190,13 @@ class ElementwiseMulImageCompute
        cl::NDRange{static_cast<cl::size_type>(x_img_width),
                    static_cast<cl::size_type>(x_img_height)};

-    auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    auto status = EnqueueNDRangeKernel(context,
+                                       kernel,
+                                       cl::NullRange,
+                                       global_work_size,
+                                       cl::NullRange,
+                                       nullptr,
+                                       event_);
    CL_CHECK_FATAL(status);
 #ifdef LITE_WITH_LOG
    VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;

--- a/lite/kernels/opencl/elementwise_sub_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc
@@ -138,8 +138,13 @@ void ElementwiseSubImageCompute::Run() {
  VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
 #endif

-  auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
+  auto status = EnqueueNDRangeKernel(context,
+                                     kernel,
+                                     cl::NullRange,
+                                     global_work_size,
+                                     cl::NullRange,
+                                     nullptr,
+                                     event_);
  CL_CHECK_FATAL(status);
 }


--- a/lite/kernels/opencl/elementwise_sub_image_compute.h
+++ b/lite/kernels/opencl/elementwise_sub_image_compute.h
@@ -20,6 +20,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/cp_logging.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -41,6 +45,14 @@ class ElementwiseSubImageCompute
    return "ElementwiseSub using cl::Image2D, kFP16";
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 protected:
  param_t* ele_param_{nullptr};
  std::string kernel_func_name_{"elementwise_sub"};

--- a/lite/kernels/opencl/expand_image_compute.cc
+++ b/lite/kernels/opencl/expand_image_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+#include "lite/backends/opencl/cl_half.h"
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/replace_stl/stream.h"
+#include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+class ExpandComputeImage2D : public KernelLite<TARGET(kOpenCL),
+                                               PRECISION(kFP16),
+                                               DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::ExpandParam;
+
+  std::string doc() const override { return "expand using cl::Image2D, kFP16"; }
+
+  void PrepareForRun() override {
+    expand_param_ = param_.get_mutable<param_t>();
+    auto expand_times = expand_param_->expand_times;
+    auto in_dims = expand_param_->X->dims();
+    CHECK(in_dims.size() == 4) << "expand image now only support indims size 4";
+    CHECK(expand_times.size() == 4)
+        << "expand image now only support in_expand_timesdims size 4";
+    CHECK(expand_times[1] == 1) << "expand image do not support expend c now";
+
+    // do not confuse with these cases.it is use to support expend c in future
+    if (in_dims[1] == 1) {
+      kernel_func_name_ = "expend_c1";
+    } else if (in_dims[1] == 2) {
+      kernel_func_name_ = "expend_c2";
+    } else if (in_dims[1] == 3) {
+      kernel_func_name_ = "expend_c3";
+    } else if (in_dims[1] == 4) {
+      kernel_func_name_ = "expend_c4";
+    } else {
+      kernel_func_name_ = "expend_cn";
+    }
+
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "image/expand_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
+
+  void ReInitWhenNeeded() override {
+    VLOG(1) << "ReInitWhenNeeded:  " << kernel_func_name_;
+
+    auto x_dims = expand_param_->X->dims();
+    auto out_dims = expand_param_->Out->dims();
+    auto expand_times = expand_param_->expand_times;
+
+    VLOG(1) << "x_dims:  " << x_dims;
+    VLOG(1) << "out_dims:  " << out_dims;
+    VLOG(1) << "expand_times:  " << expand_times[0] << " " << expand_times[1]
+            << " " << expand_times[2] << " " << expand_times[3];
+
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(out_dims);
+      VLOG(1) << "out_img_shape_:  " << out_img_shape_[0] << "  "
+              << out_img_shape_[1];
+
+      // compute global work size
+      auto image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
+      size_t work_size_0 = image_width / out_dims[3];
+      size_t work_size_1 = out_dims[3];
+      size_t work_size_2 = out_dims[0] * out_dims[2];
+      global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
+      VLOG(1) << "global_work_size_:  " << global_work_size_[0] << " "
+              << global_work_size_[1] << " " << global_work_size_[2];
+    }
+  }
+
+  void Run() override {
+    auto* x_img = expand_param_->X->data<half_t, cl::Image2D>();
+    auto* out_img = expand_param_->Out->mutable_data<half_t, cl::Image2D>(
+        out_img_shape_[0], out_img_shape_[1]);
+    auto expand_times = expand_param_->expand_times;
+
+    auto x_dims = expand_param_->X->dims();
+
+    int in_n = x_dims[0];
+    int in_c = x_dims[1];
+    int in_h = x_dims[2];
+    int in_w = x_dims[3];
+
+    auto out_dims = expand_param_->Out->dims();
+
+    int out_n = out_dims[0];
+    int out_c = out_dims[1];
+    int out_h = out_dims[2];
+    int out_w = out_dims[3];
+
+    auto out_image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
+    int out_c_block = out_image_width / out_dims[3];
+    int out_nh = out_dims[0] * out_dims[2];
+
+    auto in_image_width = x_dims[3] * ((x_dims[1] + 3) / 4);
+    int in_c_block = in_image_width / x_dims[3];
+    int in_nh = x_dims[0] * x_dims[2];
+
+    int expand_times_n = expand_times[0];
+    int expand_times_c = expand_times[1];
+    int expand_times_h = expand_times[2];
+    int expand_times_w = expand_times[3];
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, out_c_block);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, out_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, out_nh);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, in_c_block);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(4, in_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(5, in_nh);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(6, in_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(7, in_h);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(8, out_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(9, out_h);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(10, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(11, *out_img);
+    CL_CHECK_FATAL(status);
+
+    status = kernel.setArg(12, expand_times_n);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(13, expand_times_c);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(14, expand_times_h);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(15, expand_times_w);
+    CL_CHECK_FATAL(status);
+
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size_,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
+    CL_CHECK_FATAL(status);
+  }
+
+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+ private:
+  std::string kernel_func_name_{};
+  std::string build_options_{"-DCL_DTYPE_half"};
+  std::string time_stamp_{GetTimeStamp()};
+
+  param_t* expand_param_{nullptr};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
+};
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(expand,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::ExpandComputeImage2D,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
--- a/lite/kernels/opencl/expand_image_compute_test.cc
+++ b/lite/kernels/opencl/expand_image_compute_test.cc
--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -20,6 +20,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
 #include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -141,16 +145,24 @@ class FcCompute
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size_,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size_,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  int m_, n_, k_;
  param_t* fc_param_{nullptr};

--- a/lite/kernels/opencl/grid_sampler_image_compute.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -131,16 +135,24 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size_,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size_,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 protected:
  param_t* grid_param_{nullptr};
  bool first_epoch_for_reinit_{true};

--- a/lite/kernels/opencl/instance_norm_image_compute.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -137,13 +141,13 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
    status = kernel.setArg(7, *out_img);
    CL_CHECK_FATAL(status);

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        local_work_size,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  local_work_size,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

@@ -258,17 +262,25 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
    status = kernel.setArg(arg_idx++, in_w);
    CL_CHECK_FATAL(status);

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        local_work_size,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  local_work_size,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }
 #endif

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 protected:
  param_t* instance_norm_param_{nullptr};
  std::string kernel_func_name_{"instance_norm_onnx"};

--- a/lite/kernels/opencl/io_copy_buffer_compute.cc
+++ b/lite/kernels/opencl/io_copy_buffer_compute.cc
@@ -16,19 +16,46 @@
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"

+#undef LITE_WITH_LOG
+
 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {

+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
 // Host to OpenCL memory.
-void CopyFromHostSync(void* target, const void* source, size_t size) {
+float CopyFromHostSync(void* target, const void* source, size_t size) {
+#ifdef LITE_WITH_PROFILE
+  auto h2d_copy_start = GetCurrentUS();
+#endif
  TargetWrapperCL::MemcpySync(target, source, size, IoDirection::HtoD);
+#ifdef LITE_WITH_PROFILE
+  auto h2d_duration = (GetCurrentUS() - h2d_copy_start) / 1000.0;
+  return h2d_duration;
+#else
+  return 0.0;
+#endif
 }

 // Device to Host memory.
-void CopyToHostSync(void* target, const void* source, size_t size) {
+float CopyToHostSync(void* target, const void* source, size_t size) {
+#ifdef LITE_WITH_PROFILE
+  auto d2h_copy_start = GetCurrentUS();
+#endif
+  CLRuntime::Global()->command_queue().finish();
  TargetWrapperCL::MemcpySync(target, source, size, IoDirection::DtoH);
+#ifdef LITE_WITH_PROFILE
+  auto d2h_duration = (GetCurrentUS() - d2h_copy_start) / 1000.0;
+  return d2h_duration;
+#else
+  return 0.0;
+#endif
 }

 /*
@@ -37,6 +64,13 @@ void CopyToHostSync(void* target, const void* source, size_t size) {
 class IoCopyHostToOpenCLCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
 public:
+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = "HostToOpenCL";
+    ch->io_duration = h2d_duration_;
+  }
+#endif
+
  void Run() override {
    auto& param = Param<operators::IoCopyParam>();
    CHECK(param.x->target() == TARGET(kHost) ||
@@ -50,7 +84,7 @@ class IoCopyHostToOpenCLCompute
    VLOG(2) << "param.y->dims():" << param.y->dims();
 #endif
    auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size);
-    CopyFromHostSync(data, param.x->raw_data(), mem_size);
+    h2d_duration_ = CopyFromHostSync(data, param.x->raw_data(), mem_size);
  }

  std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
@@ -74,6 +108,8 @@ class IoCopyHostToOpenCLCompute
  }

  std::string doc() const override { return "Copy IO from HOST to OpenCL"; }
+
+  float h2d_duration_{0};
 };

 /*
@@ -82,6 +118,13 @@ class IoCopyHostToOpenCLCompute
 class IoCopykOpenCLToHostCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
 public:
+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = "OpenCLToHost";
+    ch->io_duration = d2h_duration_;
+  }
+#endif
+
  void Run() override {
    auto& param = Param<operators::IoCopyParam>();
    CHECK(param.x->target() == TARGET(kOpenCL));
@@ -109,12 +152,13 @@ class IoCopykOpenCLToHostCompute
 #ifdef LITE_WITH_LOG
    VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
 #endif
-    CLRuntime::Global()->command_queue().finish();

-    CopyToHostSync(data, param.x->raw_data(), mem_size);
+    d2h_duration_ = CopyToHostSync(data, param.x->raw_data(), mem_size);
  }

  std::string doc() const override { return "Copy IO from OpenCL to HOST"; }
+
+  float d2h_duration_{0};
 };

 }  // namespace opencl
@@ -161,3 +205,5 @@ REGISTER_LITE_KERNEL(io_copy_once,
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
    .Finalize();
+
+#define LITE_WITH_LOG
--- a/lite/kernels/opencl/layout_image_compute.cc
+++ b/lite/kernels/opencl/layout_image_compute.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include "lite/api/paddle_place.h"
 #include "lite/backends/opencl/cl_half.h"
+#include "lite/backends/opencl/cl_utility.h"
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/target_wrapper.h"
@@ -24,6 +25,8 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/cp_logging.h"

+#undef LITE_WITH_LOG
+
 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -50,6 +53,14 @@ class LayoutComputeBufferChwToImageDefault
                                    time_stamp_);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  void Run() override {
    auto& param = Param<param_t>();
    const cl::Buffer* x_data;
@@ -128,13 +139,13 @@ class LayoutComputeBufferChwToImageDefault
                    static_cast<cl::size_type>(new_dims[3]),
                    static_cast<cl::size_type>(new_dims[0] * new_dims[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

@@ -168,6 +179,14 @@ class LayoutComputeImageDefaultToBufferChw
                                    time_stamp_);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  void Run() override {
    auto& param = Param<param_t>();
    const cl::Buffer* y_data;
@@ -237,13 +256,13 @@ class LayoutComputeImageDefaultToBufferChw
                    static_cast<cl::size_type>(new_dims[3]),
                    static_cast<cl::size_type>(new_dims[0] * new_dims[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

@@ -274,6 +293,14 @@ class LayoutComputeBufferChwToImage2DNw
                                    time_stamp_);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
  void Run() override {
    auto& param = Param<param_t>();
    auto* x_data = param.x->data<float, cl::Buffer>();
@@ -333,13 +360,13 @@ class LayoutComputeBufferChwToImage2DNw
                    static_cast<cl::size_type>(out_W),            // w
                    static_cast<cl::size_type>(out_C * out_H)};   // ch

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

@@ -394,3 +421,4 @@ REGISTER_LITE_KERNEL(
                                       PRECISION(kAny),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
+#define LITE_WITH_LOG
--- a/lite/kernels/opencl/lrn_image_compute.cc
+++ b/lite/kernels/opencl/lrn_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -128,13 +132,13 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
 #ifdef LITE_WITH_LOG
    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
@@ -142,6 +146,14 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
 #endif
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 protected:
  param_t* lrn_param_{nullptr};
  int n_{5};

--- a/lite/kernels/opencl/mul_buffer_compute.cc
+++ b/lite/kernels/opencl/mul_buffer_compute.cc
@@ -20,6 +20,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
 #include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -92,16 +96,24 @@ class MulCompute
    auto global_work_size = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
                                        static_cast<size_t>((n_ + 3) / 4)};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  int m_, n_, k_;
  std::string kernel_func_name_{"mat_mul"};

--- a/lite/kernels/opencl/nearest_interp_image_compute.cc
+++ b/lite/kernels/opencl/nearest_interp_image_compute.cc
@@ -19,6 +19,10 @@
 #include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -110,16 +114,24 @@ class NearestInterpComputeImageDefault
                    static_cast<cl::size_type>(default_work_size.data()[1]),
                    static_cast<cl::size_type>(default_work_size.data()[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  std::string kernel_func_name_{"nearest_interp"};
  std::string build_options_{" -DCL_DTYPE_half"};

--- a/lite/kernels/opencl/pad2d_image_compute.cc
+++ b/lite/kernels/opencl/pad2d_image_compute.cc
@@ -23,6 +23,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/logging.h"
 #include "lite/utils/replace_stl/stream.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -142,13 +146,13 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
 #ifdef LITE_WITH_LOG
    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
@@ -156,6 +160,14 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
 #endif
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 protected:
  param_t* pad2d_param_{nullptr};
  std::string kernel_func_name_{};

--- a/lite/kernels/opencl/pixel_shuffle_image_compute.cc
+++ b/lite/kernels/opencl/pixel_shuffle_image_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+#include "lite/backends/opencl/cl_half.h"
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/replace_stl/stream.h"
+#include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+class PixelShuffleComputeImage2D
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFP16),
+                        DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::PixelShuffleParam;
+
+  std::string doc() const override {
+    return "PixelShuffle using cl::Image2D, kFP16";
+  }
+
+  void PrepareForRun() override {
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "image/pixel_shuffle_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
+
+  void ReInitWhenNeeded() override {
+    VLOG(1) << "ReInitWhenNeeded:  " << kernel_func_name_;
+    pixel_shuffle_param_ = param_.get_mutable<param_t>();
+    auto x_dims = pixel_shuffle_param_->x->dims();
+    auto out_dims = pixel_shuffle_param_->output->dims();
+    VLOG(1) << "x_dims:  " << x_dims;
+    VLOG(1) << "out_dims:  " << out_dims;
+    VLOG(1) << "upscale_factor:  " << pixel_shuffle_param_->upscale_factor;
+
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(
+          pixel_shuffle_param_->output->dims());
+      VLOG(1) << "out_img_shape_:  " << out_img_shape_[0] << "  "
+              << out_img_shape_[1];
+
+      // compute global work size
+      auto image_width = out_dims[3] * ((out_dims[1] + 3) / 4);
+      size_t work_size_0 = image_width / out_dims[3];
+      size_t work_size_1 = out_dims[3];
+      size_t work_size_2 = out_dims[0] * out_dims[2];
+      global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
+      VLOG(1) << "global_work_size_:  " << global_work_size_[0] << " "
+              << global_work_size_[1] << " " << global_work_size_[2];
+    }
+  }
+
+  void Run() override {
+    auto* x_img = pixel_shuffle_param_->x->data<half_t, cl::Image2D>();
+    auto* out_img =
+        pixel_shuffle_param_->output->mutable_data<half_t, cl::Image2D>(
+            out_img_shape_[0], out_img_shape_[1]);
+
+    auto x_dims = pixel_shuffle_param_->x->dims();
+
+    int in_n = x_dims[0];
+    int in_c = x_dims[1];
+    int in_h = x_dims[2];
+    int in_w = x_dims[3];
+
+    auto out_dims = pixel_shuffle_param_->output->dims();
+
+    int out_n = out_dims[0];
+    int out_c = out_dims[1];
+    int out_h = out_dims[2];
+    int out_w = out_dims[3];
+
+    const int upscale_factor = pixel_shuffle_param_->upscale_factor;
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, *out_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, in_n);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, in_c);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(4, in_h);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(5, in_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(6, out_n);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(7, out_c);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(8, out_h);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(9, out_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(10, upscale_factor);
+    CL_CHECK_FATAL(status);
+
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size_,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
+    CL_CHECK_FATAL(status);
+  }
+
+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+ private:
+  std::string kernel_func_name_{"pixel_shuffle"};
+  std::string build_options_{"-DCL_DTYPE_half"};
+  std::string time_stamp_{GetTimeStamp()};
+
+  param_t* pixel_shuffle_param_{nullptr};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
+};
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(pixel_shuffle,
+                     kOpenCL,
+                     kFP16,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::PixelShuffleComputeImage2D,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
--- a/lite/kernels/opencl/pixel_shuffle_image_compute_test.cc
+++ b/lite/kernels/opencl/pixel_shuffle_image_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <random>
+
+#include <gtest/gtest.h>
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/kernels/opencl/test_helper.h"
+
+#define FP16_MAX_DIFF (5e-1)
+
+namespace paddle {
+namespace lite {
+
+TEST(pixel_shuffle_image2d, compute) {
+  LOG(INFO) << "create kernel ...";
+  auto kernels = KernelRegistry::Global().Create("pixel_shuffle",
+                                                 TARGET(kOpenCL),
+                                                 PRECISION(kFP16),
+                                                 DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+
+  const int INPUT_N = 1;
+  const int INPUT_C = 4;
+  const int INPUT_H = 2;
+  const int INPUT_W = 2;
+  const int UPSCALE_FACTOR = 2;
+
+  auto kernel = std::move(kernels.front());
+
+  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
+
+  lite::Tensor x, out;
+  operators::PixelShuffleParam param;
+  param.x = &x;
+  param.output = &out;
+  param.upscale_factor = UPSCALE_FACTOR;
+
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+
+  kernel->SetParam(param);
+  std::unique_ptr<KernelContext> pixel_shuffle_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(pixel_shuffle_context->As<OpenCLContext>()));
+
+  kernel->SetContext(std::move(pixel_shuffle_context));
+
+  const DDim in_dim =
+      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
+  const DDim out_dim = DDim(
+      std::vector<DDim::value_type>{INPUT_N,
+                                    INPUT_C / UPSCALE_FACTOR / UPSCALE_FACTOR,
+                                    INPUT_H * UPSCALE_FACTOR,
+                                    INPUT_W * UPSCALE_FACTOR});
+  LOG(INFO) << "in_dim: " << in_dim;
+  LOG(INFO) << "UPSCALE_FACTOR: " << UPSCALE_FACTOR;
+  LOG(INFO) << "out_dim: " << out_dim;
+
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-2, 2);
+  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
+
+  int index = 0;
+  for (auto& i : input_v) {
+    i = index++;
+  }
+  VLOG(1) << "input_v ..... ";
+  for (size_t i = 0; i < input_v.size(); i++) {
+    VLOG(10) << input_v[i];
+  }
+
+  LOG(INFO) << "prepare input";
+  CLImageConverterDefault* default_converter = new CLImageConverterDefault();
+  DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  auto* x_image = x.mutable_data<half_t, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  VLOG(1) << "x_image_data ..... ";
+  for (size_t i = 0; i < x_image_data.size(); i++) {
+    VLOG(10) << Half2Float(x_image_data[i]);
+  }
+  DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
+                                                          out_image_shape[1]);
+  kernel->Launch();
+  CLRuntime::Global()->command_queue().finish();
+  std::vector<float> out_data_v{
+      0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15};
+
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  half_t* out_image_data = new half_t[out_image_shape.production() * 4];
+  TargetWrapperCL::ImgcpySync(out_image_data,
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  VLOG(1) << "out_image_data ..... ";
+  for (size_t i = 0; i < out_image_shape.production() * 4; i++) {
+    VLOG(10) << Half2Float(out_image_data[i]);
+  }
+  float* out_data = new float[out_image_shape.production() * 4];
+  default_converter->ImageToNCHW(
+      out_image_data, out_data, out_image_shape, out_dim);
+
+  VLOG(1) << "out_data ..... ";
+  for (int i = 0; i < out_dim.production(); i++) {
+    VLOG(10) << out_data[i];
+  }
+
+  for (int i = 0; i < out_dim.production(); i++) {
+    auto abs_diff = abs(out_data[i] - out_data_v[i]);
+    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
+    EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
+              true);
+    if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
+      LOG(ERROR) << "error idx:" << i << " out_data[" << i
+                 << "]:" << out_data[i] << " "
+                                           "out_ref["
+                 << i << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
+                 << " relative_diff:" << relative_diff
+                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
+    }
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(pixel_shuffle, kOpenCL, kFP16, kImageDefault, image2d);
--- a/lite/kernels/opencl/pool_buffer_compute.cc
+++ b/lite/kernels/opencl/pool_buffer_compute.cc
@@ -20,6 +20,10 @@
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
 #include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"

 namespace paddle {
 namespace lite {
@@ -106,16 +110,24 @@ class PoolCompute
    CL_CHECK_FATAL(status);
    auto global_work_size = cl::NDRange(static_cast<size_t>(numel));

-    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-        kernel,
-        cl::NullRange,
-        global_work_size,
-        cl::NullRange,
-        nullptr,
-        nullptr);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
    CL_CHECK_FATAL(status);
  }

+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
 private:
  std::string kernel_func_name_{"pool_"};
  std::string build_options_{"-DCL_DTYPE_float"};

--- a/lite/kernels/opencl/pool_image_compute.cc
+++ b/lite/kernels/opencl/pool_image_compute.cc
--- a/lite/kernels/opencl/reshape_image_compute.cc
+++ b/lite/kernels/opencl/reshape_image_compute.cc
--- a/lite/kernels/opencl/scale_image_compute.cc
+++ b/lite/kernels/opencl/scale_image_compute.cc
--- a/lite/kernels/opencl/slice_image_compute.cc
+++ b/lite/kernels/opencl/slice_image_compute.cc
--- a/lite/model_parser/cpp/block_desc.h
+++ b/lite/model_parser/cpp/block_desc.h
--- a/lite/model_parser/cpp/program_desc.h
+++ b/lite/model_parser/cpp/program_desc.h
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -108,6 +108,7 @@ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.c
 add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS})
 add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
 add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
+add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS})

 # for OCR specific
 add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})

--- a/lite/operators/activation_ops.h
+++ b/lite/operators/activation_ops.h
--- a/lite/operators/affine_channel_op.h
+++ b/lite/operators/affine_channel_op.h
--- a/lite/operators/argmax_op.h
+++ b/lite/operators/argmax_op.h
--- a/lite/operators/assign_op.h
+++ b/lite/operators/assign_op.h
--- a/lite/operators/assign_value_op.h
+++ b/lite/operators/assign_value_op.h
--- a/lite/operators/axpy_op.h
+++ b/lite/operators/axpy_op.h
--- a/lite/operators/batch_norm_op.h
+++ b/lite/operators/batch_norm_op.h
--- a/lite/operators/box_clip_op.h
+++ b/lite/operators/box_clip_op.h
--- a/lite/operators/box_coder_op.h
+++ b/lite/operators/box_coder_op.h
--- a/lite/operators/calib_op.h
+++ b/lite/operators/calib_op.h
--- a/lite/operators/compare_op.h
+++ b/lite/operators/compare_op.h
--- a/lite/operators/concat_op.h
+++ b/lite/operators/concat_op.h
--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
--- a/lite/operators/conv_transpose_op.h
+++ b/lite/operators/conv_transpose_op.h
--- a/lite/operators/elementwise_ops.h
+++ b/lite/operators/elementwise_ops.h
--- a/lite/operators/fc_op.h
+++ b/lite/operators/fc_op.h
--- a/lite/operators/increment_op.h
+++ b/lite/operators/increment_op.h
--- a/lite/operators/instance_norm_op.h
+++ b/lite/operators/instance_norm_op.h
--- a/lite/operators/interpolate_op.h
+++ b/lite/operators/interpolate_op.h
--- a/lite/operators/io_copy_op.h
+++ b/lite/operators/io_copy_op.h
--- a/lite/operators/layer_norm_op.h
+++ b/lite/operators/layer_norm_op.h
--- a/lite/operators/layout_op.h
+++ b/lite/operators/layout_op.h
--- a/lite/operators/logical_op.h
+++ b/lite/operators/logical_op.h
--- a/lite/operators/lrn_op.h
+++ b/lite/operators/lrn_op.h
--- a/lite/operators/matmul_op.h
+++ b/lite/operators/matmul_op.h
--- a/lite/operators/mean_op.h
+++ b/lite/operators/mean_op.h
--- a/lite/operators/mul_op.h
+++ b/lite/operators/mul_op.h
--- a/lite/operators/negative_op.h
+++ b/lite/operators/negative_op.h
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
--- a/lite/operators/pixel_shuffle_op.cc
+++ b/lite/operators/pixel_shuffle_op.cc
--- a/lite/operators/pixel_shuffle_op.h
+++ b/lite/operators/pixel_shuffle_op.h
--- a/lite/operators/pool_op.h
+++ b/lite/operators/pool_op.h
--- a/lite/operators/power_op.h
+++ b/lite/operators/power_op.h
--- a/lite/operators/reduce_max_op.h
+++ b/lite/operators/reduce_max_op.h
--- a/lite/operators/reduce_mean_op.h
+++ b/lite/operators/reduce_mean_op.h
--- a/lite/operators/reduce_prod_op.h
+++ b/lite/operators/reduce_prod_op.h
--- a/lite/operators/relu_op.h
+++ b/lite/operators/relu_op.h
--- a/lite/operators/reshape_op.h
+++ b/lite/operators/reshape_op.h
--- a/lite/operators/scale_op.h
+++ b/lite/operators/scale_op.h
--- a/lite/operators/search_aligned_mat_mul_op.h
+++ b/lite/operators/search_aligned_mat_mul_op.h
--- a/lite/operators/search_fc_op.h
+++ b/lite/operators/search_fc_op.h
--- a/lite/operators/search_seq_fc_op.h
+++ b/lite/operators/search_seq_fc_op.h
--- a/lite/operators/search_seq_softmax_op.h
+++ b/lite/operators/search_seq_softmax_op.h
--- a/lite/operators/softmax_op.h
+++ b/lite/operators/softmax_op.h
--- a/lite/operators/squeeze_op.h
+++ b/lite/operators/squeeze_op.h
--- a/lite/tests/math/deformable_conv_compute_test.cc
+++ b/lite/tests/math/deformable_conv_compute_test.cc