Commit 0cd75c62 authored by root

Merge branch 'new_dev' of https://github.com/PaddleLite-EB/Paddle-Lite into fpga_pr

# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
......@@ -34,6 +34,8 @@ Welcome to Paddle-Lite's documentation!
:caption: 使用指南
:name: sec-user-guides
user_guides/cuda
.. toctree::
:maxdepth: 1
:caption: 进阶使用指南
......
# CUDA-based model inference with Lite
Lite supports building and running with CUDA on x86_64 and arm64 architectures (e.g. TX2).
## Build
**NOTE:** When building on NVIDIA embedded hardware such as the TX2, please install the dependencies with the latest [JetPack](https://developer.nvidia.com/embedded/jetpack).
Step 1: download the code
```
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
```
Step 2: build
```
# Enter the source directory
cd Paddle-Lite
# Run the build script
# When the build finishes, a build_cuda directory is generated in the current directory
# If CUDA or CUDNN cannot be found during the build, set the environment variables CUDA_TOOLKIT_ROOT_DIR and CUDNN_ROOT
# CUDA_TOOLKIT_ROOT_DIR and CUDNN_ROOT are the root directories of CUDA and CUDNN respectively
./lite/tools/build.sh cuda
# To use the Python API, enable the build_python option
./lite/tools/build.sh --build_python=ON cuda
```
When the build finishes, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
## Run
The following uses the YOLOv3 model as an example to show how to run a model on NVIDIA GPU hardware.
Step 1: download the darknet_yolov3 model; for details about the model, see [here](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3)
```
# Download the model
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
tar -zxf yolov3_infer.tar.gz
# Download a sample image
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
```
Step 2: run
**NOTE:** This example uses the Python API; the C++ API and examples will be released later.
``` python
#-*- coding: utf-8 -*-
from __future__ import print_function
import sys
import numpy as np
import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import *
def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32')
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
h, w, _ = im.shape
im_scale_x = resize_w / float(w)
im_scale_y = resize_h / float(h)
out_img = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_CUBIC)
mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, -1))
std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, -1))
out_img = (out_img / 255.0 - mean) / std
out_img = out_img.transpose((2, 0, 1))
return out_img
# Configure the config
config = CxxConfig()
config.set_model_file('./yolov3_infer/__model__')  # path to the model file
config.set_param_file('./yolov3_infer/__params__')  # path to the params file
place_cuda = Place(TargetType.CUDA)
config.set_valid_places([place_cuda])
# Create the predictor
predictor = create_paddle_predictor(config)
# Set the inputs
input_tensor = predictor.get_input(0)
height, width = 608, 608
input_tensor.resize([1, 3, height, width])
data = read_img('./kite.jpg', height, width).flatten()
input_tensor.set_float_data(data, TargetType.CUDA)
in2 = predictor.get_input(1)
in2.resize([1, 2])
in2.set_int32_data([height, width], TargetType.CUDA)
# Run
predictor.run()
# Get the outputs
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
# [100L, 6L]
print(output_tensor.target())
# TargetType.Host
print(output_tensor.float_data()[:6])
# [0.0, 0.9862784743309021, 98.51927185058594, 471.2381286621094, 120.73092651367188, 578.33251953125]
```
**NOTE:** CUDA support is still under active development.
......@@ -232,6 +232,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -251,6 +253,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......@@ -218,20 +218,11 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
--model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL)
add_dependencies(test_resnet50 extern_lite_download_resnet50_tar_gz)
lite_cc_test(test_resnet50_fpga SRCS resnet50_test_fpga.cc
lite_cc_test(test_ssd_fpga SRCS test_ssd_fpga.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
# CL_DEPS ${opencl_kernels}
......@@ -296,10 +287,10 @@ if (LITE_ON_TINY_PUBLISH)
endif()
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling model_optimize_tool")
lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
DEPS gflags kernel op optimizer mir_passes utils)
add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
......
......@@ -181,6 +181,7 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env,
MobileConfig config;
// set model dir
// NOTE: This is a deprecated API and will be removed in a later release.
jmethodID model_dir_method = env->GetMethodID(
mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;");
jstring java_model_dir =
......@@ -190,6 +191,27 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env,
config.set_model_dir(cpp_model_dir);
}
// set model from file
jmethodID model_file_method = env->GetMethodID(
mobileconfig_jclazz, "getModelFromFile", "()Ljava/lang/String;");
jstring java_model_file =
(jstring)env->CallObjectMethod(jmobileconfig, model_file_method);
if (java_model_file != nullptr) {
std::string cpp_model_file = jstring_to_cpp_string(env, java_model_file);
config.set_model_from_file(cpp_model_file);
}
// set model from buffer
jmethodID model_buffer_method = env->GetMethodID(
mobileconfig_jclazz, "getModelFromBuffer", "()Ljava/lang/String;");
jstring java_model_buffer =
(jstring)env->CallObjectMethod(jmobileconfig, model_buffer_method);
if (java_model_buffer != nullptr) {
std::string cpp_model_buffer =
jstring_to_cpp_string(env, java_model_buffer);
config.set_model_from_buffer(cpp_model_buffer);
}
// set threads
jmethodID threads_method =
env->GetMethodID(mobileconfig_jclazz, "getThreads", "()I");
......
......@@ -64,6 +64,44 @@ public class MobileConfig extends ConfigBase {
return powerMode.value();
}
/**
* Set model from file.
*
* @param modelFile path of the model file
*/
public void setModelFromFile(String modelFile) {
this.liteModelFile = modelFile;
}
/**
* Returns the path of the model file.
*
* @return liteModelFile
*/
public String getModelFile() {
return liteModelFile;
}
/**
* Set model from buffer.
*
* @param modelBuffer model data held in a string buffer
*/
public void setModelFromBuffer(String modelBuffer) {
this.liteModelBuffer = modelBuffer;
}
/**
* Returns model buffer
*
* @return liteModelBuffer
*/
public String getModelBuffer() {
return liteModelBuffer;
}
private PowerMode powerMode = PowerMode.LITE_POWER_HIGH;
private int threads = 1;
private String liteModelFile;
private String liteModelBuffer;
}
......@@ -62,7 +62,7 @@ TEST(CXXApi_LightApi, optim_model) {
TEST(CXXApi_LightApi, save_and_load_model) {
lite::Predictor cxx_api;
lite::LightPredictor light_api(FLAGS_optimized_model);
lite::LightPredictor light_api(FLAGS_optimized_model + ".nb", false);
// CXXAPi
{
......
......@@ -116,7 +116,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
lite_api::MobileConfig config;
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
auto predictor = lite_api::CreatePaddlePredictor(config);
......
......@@ -121,6 +121,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
......@@ -130,6 +131,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = exec_scope_->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
#endif
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......@@ -167,6 +179,8 @@ void Predictor::PrepareFeedFetch() {
}
}
#ifndef LITE_WITH_FPGA
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
CHECK(output_names_.size() > offset)
<< "The network has " << output_names_.size() << " outputs"
......@@ -186,6 +200,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
}
return outputs;
}
#else
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
std::vector<const lite::Tensor *> outputs;
for (auto& out : fetch_list) {
outputs.push_back(&out);
}
return outputs;
}
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
......@@ -239,7 +276,7 @@ void Predictor::Build(const std::string &model_path,
case lite_api::LiteModelType::kNaiveBuffer:
CHECK(!model_path.empty())
<< "NaiveBuffer backend only supported combined param";
LoadModelNaive(model_path, scope_.get(), &program_desc_);
LoadModelNaiveFromFile(model_path, scope_.get(), &program_desc_);
break;
default:
LOG(FATAL) << "Unknown model type";
......
......@@ -101,7 +101,7 @@ TEST(CXXApi, save_model) {
TEST(CXXApi, load_model_naive) {
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_optimized_model + ".naive",
predictor.Build(FLAGS_optimized_model + ".naive.nb",
"",
"",
valid_places,
......
......@@ -18,6 +18,17 @@
namespace paddle {
namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
}
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch();
}
void LightPredictor::Build(const std::string& model_dir,
const std::string& model_buffer,
const std::string& param_buffer,
......
......@@ -18,6 +18,7 @@
*/
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
......@@ -39,12 +40,22 @@ namespace lite {
*/
class LITE_API LightPredictor {
public:
LightPredictor(
const std::string& model_dir,
const std::string& model_buffer = "",
const std::string& param_buffer = "",
bool model_from_memory = false,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf) {
// Constructor of LightPredictor. `lite_model_file` is either the path of the
// model file or the model data buffer, and `model_from_memory` indicates
// whether the model is loaded from memory.
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
Build(lite_model_file, model_from_memory);
}
// NOTE: This is a deprecated API and will be removed in a later release.
LightPredictor(const std::string& model_dir,
const std::string& model_buffer = "",
const std::string& param_buffer = "",
bool model_from_memory = false,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
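For illustration, a minimal sketch of the two construction paths described above (this is an internal API; `model.nb` is a placeholder for a model already converted to the NaiveBuffer format, and the header path is assumed to match the source tree):

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include "lite/api/light_api.h"

void light_predictor_demo() {
  // New path: build directly from a NaiveBuffer model file on disk.
  paddle::lite::LightPredictor from_file("model.nb",
                                         /*model_from_memory=*/false);

  // New path: build from model data that has already been read into memory.
  std::ifstream fin("model.nb", std::ios::binary);
  std::stringstream ss;
  ss << fin.rdbuf();
  paddle::lite::LightPredictor from_buffer(ss.str(),
                                           /*model_from_memory=*/true);
}
```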
......@@ -69,6 +80,10 @@ class LITE_API LightPredictor {
void PrepareFeedFetch();
private:
void Build(const std::string& lite_model_file,
bool model_from_memory = false);
// NOTE: This is a deprecated API and will be removed in a later release.
void Build(
const std::string& model_dir,
const std::string& model_buffer,
......
......@@ -23,13 +23,17 @@ namespace lite {
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor only supports the NaiveBuffer backend in the published lib
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
if (config.lite_model_file().empty()) {
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
} else {
raw_predictor_.reset(new LightPredictor(config.lite_model_file(),
config.model_from_memory()));
}
mode_ = config.power_mode();
threads_ = config.threads();
}
......
......@@ -73,7 +73,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......
......@@ -17,7 +17,7 @@
#include <gtest/gtest.h>
#endif
// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h"
// are created automatically during model_optimize_tool's compiling period
// are created automatically during opt's compiling period
#include <iomanip>
#include "all_kernel_faked.cc" // NOLINT
#include "kernel_src_map.h" // NOLINT
......
......@@ -190,5 +190,27 @@ void ConfigBase::set_threads(int threads) {
#endif
}
// Set model data in combined format. `set_model_from_file` loads the model
// from a file, while `set_model_from_buffer` loads the model from a memory
// buffer.
void MobileConfig::set_model_from_file(const std::string &x) {
lite_model_file_ = x;
}
void MobileConfig::set_model_from_buffer(const std::string &x) {
lite_model_file_ = x;
model_from_memory_ = true;
}
void MobileConfig::set_model_buffer(const char *model_buffer,
size_t model_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
LOG(WARNING) << "warning: `set_model_buffer` will be abandened in "
"release/v3.0.0, new method `set_model_from_buffer(const "
"std::string &x)` is recommended.";
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
} // namespace lite_api
} // namespace paddle
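For reference, a minimal usage sketch of the new MobileConfig interface (not part of this change; `mobilenet_v1.nb` is a placeholder for a model already converted to the NaiveBuffer format by the opt tool):

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include "lite/api/paddle_api.h"

int main() {
  // Path 1: load the optimized .nb model directly from a file.
  paddle::lite_api::MobileConfig file_config;
  file_config.set_model_from_file("mobilenet_v1.nb");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(file_config);

  // Path 2: read the same file into memory and load it from a buffer.
  std::ifstream fin("mobilenet_v1.nb", std::ios::binary);
  std::stringstream ss;
  ss << fin.rdbuf();
  paddle::lite_api::MobileConfig buffer_config;
  buffer_config.set_model_from_buffer(ss.str());
  auto predictor_from_buffer =
      paddle::lite_api::CreatePaddlePredictor(buffer_config);
  return 0;
}
```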
......@@ -168,22 +168,40 @@ class LITE_API CxxConfig : public ConfigBase {
/// MobileConfig is the config for the light weight predictor, it will skip
/// IR optimization or other unnecessary stages.
class LITE_API MobileConfig : public ConfigBase {
// whether to load data from memory. Model data will be loaded from memory
// buffer if model_from_memory_ is true.
bool model_from_memory_{false};
// model data read from file or memory buffer in combined format.
std::string lite_model_file_;
// NOTE: This is a deprecated variable and will be removed in a later release.
std::string model_buffer_;
std::string param_buffer_;
bool model_from_memory_{false};
public:
// Set model data in combined format. `set_model_from_file` loads the model
// from a file, while `set_model_from_buffer` loads the model from a memory
// buffer.
void set_model_from_file(const std::string& x);
void set_model_from_buffer(const std::string& x);
// return model data in lite_model_file_, which is in combined format.
const std::string& lite_model_file() const { return lite_model_file_; }
// return model_from_memory_, which indicates whether to load model from
// memory buffer.
bool model_from_memory() const { return model_from_memory_; }
// NOTE: This is a deprecated API and will be removed in a later release.
void set_model_buffer(const char* model_buffer,
size_t model_buffer_size,
const char* param_buffer,
size_t param_buffer_size) {
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
size_t param_buffer_size);
bool model_from_memory() const { return model_from_memory_; }
// NOTE: This is a deprecated API and will be removed in a later release.
const std::string& model_buffer() const { return model_buffer_; }
// NOTE: This is a deprecated API and will be removed in a later release.
const std::string& param_buffer() const { return param_buffer_; }
};
......
......@@ -72,7 +72,7 @@ TEST(CxxApi, run) {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(LightApi, run) {
lite_api::MobileConfig config;
config.set_model_dir(FLAGS_model_dir + ".opt2.naive");
config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -109,16 +109,11 @@ TEST(LightApi, run) {
// Demo2 for Loading model from memory
TEST(MobileConfig, LoadfromMemory) {
// Get naive buffer
auto model_path = std::string(FLAGS_model_dir) + ".opt2.naive/__model__.nb";
auto params_path = std::string(FLAGS_model_dir) + ".opt2.naive/param.nb";
std::string model_buffer = lite::ReadFile(model_path);
size_t size_model = model_buffer.length();
std::string params_buffer = lite::ReadFile(params_path);
size_t size_params = params_buffer.length();
auto model_file = std::string(FLAGS_model_dir) + ".opt2.naive.nb";
std::string model_buffer = lite::ReadFile(model_file);
// set model buffer and run model
lite_api::MobileConfig config;
config.set_model_buffer(
model_buffer.c_str(), size_model, params_buffer.c_str(), size_params);
config.set_model_from_buffer(model_buffer);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
......
......@@ -41,6 +41,7 @@ USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(kernel_place_correct_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass);
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
......
......@@ -116,6 +116,8 @@ void BindLiteMobileConfig(py::module *m) {
py::class_<MobileConfig> mobile_config(*m, "MobileConfig");
mobile_config.def(py::init<>())
.def("set_model_from_file", &MobileConfig::set_model_from_file)
.def("set_model_from_buffer", &MobileConfig::set_model_from_buffer)
.def("set_model_dir", &MobileConfig::set_model_dir)
.def("model_dir", &MobileConfig::model_dir)
.def("set_model_buffer", &MobileConfig::set_model_buffer)
......
......@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
std::vector<Place> valid_places(
{Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build(FLAGS_model_dir,
"",
"",
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <dirent.h>
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_file, "", "input_file");
namespace paddle {
namespace lite {
std::vector<std::string> GetDirectoryFiles(const std::string& dir) {
std::vector<std::string> files;
std::shared_ptr<DIR> directory_ptr(opendir(dir.c_str()),
[](DIR* dir) { if (dir) closedir(dir); });
struct dirent* dirent_ptr;
if (!directory_ptr) {
std::cout << "Error opening : " << std::strerror(errno) << dir << std::endl;
return files;
}
while ((dirent_ptr = readdir(directory_ptr.get())) != nullptr) {
files.push_back(std::string(dirent_ptr->d_name));
}
return files;
}
void readFromFile(int num, std::string path, float* data) {
std::ifstream file_stream(path);
// file_stream.open(path);
if (!file_stream.good()) {
std::cout << "file: " << path << " dones not exist!\n";
exit(-1);
return;
}
// float* data = mutableData<float>();
for (int i = 0; i < num; ++i) {
float value = 0;
file_stream >> value;
data[i] = value;
}
file_stream.close();
}
// #ifdef LITE_WITH_FPGA
TEST(ResNet50, test) {
lite::Predictor predictor;
std::vector<Place> valid_places({
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
// predictor.Build(FLAGS_model_dir, "", "", valid_places);
predictor.Build("",
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places);
auto* input_tensor = predictor.GetInput(0);
int width = 300;
int height = 300;
// std::ifstream file_stream(FLAGS_input_file);
// if (!file_stream.good()) {
// std::cout << "file: " << FLAGS_input_file << " dones not exist!\n";
// exit(-1);
// return;
// }
// file_stream >> height;
// file_stream >> width;
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, height, width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
// readFromFile(item_size, "car.data", data);
int num = 3 * width * height;
// for (int i = 0; i < num; ++i) {
// float value = 0;
// file_stream >> value;
// data[i] = value;
// }
// file_stream.close();
for (int i = 0; i < 2; ++i) {
predictor.Run();
}
auto* out = predictor.GetOutput(0);
for (int i = 0; i < out->dims().production(); i++) {
std::cout << ":" << out->data<float>()[i] << std::endl;
}
std::string file = "output/" + FLAGS_input_file.substr(6);
std::cout << "file:::" << file << std::endl;
std::ofstream ofs;
ofs.open(file);
for (int i = 0; i < out->dims().production(); i++) {
float value = out->data<float>()[i];
ofs << value << std::endl;
}
ofs.close();
LOG(INFO) << "================== Speed Report ===================";
}
// #endif
} // namespace lite
} // namespace paddle
......@@ -109,7 +109,7 @@ void conv_depthwise_5x5s1_fp32(float* dout,
tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size);
float* pre_out = pre_din + pre_in_size;
#else
float pre_din = tmp_din;
float* pre_din = tmp_din;
float* pre_out = pre_din + pre_in_size;
#endif
prepack_input_nxwc4_dw(
......
......@@ -46,6 +46,7 @@ void fp32_to_int8(const float* din,
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vmax = vdupq_n_f32(-127.f);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
......@@ -63,6 +64,14 @@ void fp32_to_int8(const float* din,
"fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n"
"fcmge v8.4s, v4.4s, %[vmax].4s \n"
"fcmge v9.4s, v5.4s, %[vmax].4s \n"
"fcmge v10.4s, v6.4s, %[vmax].4s \n"
"fcmge v11.4s, v7.4s, %[vmax].4s \n"
"bif v4.16b, %[vmax].16b, v8.16b \n"
"bif v5.16b, %[vmax].16b, v9.16b \n"
"bif v6.16b, %[vmax].16b, v10.16b \n"
"bif v7.16b, %[vmax].16b, v11.16b \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
"FCVTAS v8.4s, v4.4s \n"
......@@ -79,7 +88,7 @@ void fp32_to_int8(const float* din,
"str q8, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0",
"v1",
"v2",
......@@ -104,15 +113,23 @@ void fp32_to_int8(const float* din,
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q8 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n"
"vbif q4, %q[vmax], q8 @ choose \n"
"vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n"
"vbif q5, %q[vmax], q9 @ choose \n"
"vbif q6, %q[vmax], q10 @ choose \n"
"vbif q7, %q[vmax], q8 @ choose \n"
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n"
......@@ -133,25 +150,16 @@ void fp32_to_int8(const float* din,
: [vscale] "w"(vscale),
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
[vzero] "w"(vzero),
[vmax] "w"(vmax)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10");
#endif
}
const float* din_r = din_c + 16 * cnt;
signed char* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i];
}
}
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/fpga_cv.hpp"
using paddle::zynqmp::float16;
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height) {
paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0};
paddle::zynqmp::config_inplace(inplace_args);
paddle::zynqmp::ImageInputArgs input_args = {nullptr};
input_args.address = nullptr;
input_args.scale_address = nullptr;
float16* input_image_address =
reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc(
input_width * input_height * input_channel * sizeof(float16)));
int index = 0;
for (int i = 0; i < input_width * input_height * input_channel; i++) {
input_image_address[i] = float16(1.0 * input[i]);
}
paddle::zynqmp::ResizeArgs resize_args = {0};
resize_args.input_width = input_width;
resize_args.input_height = input_height;
resize_args.image_channel = input_channel;
resize_args.output_width = output_width;
resize_args.output_height = output_height;
float height_ratio = static_cast<float>(input_height) /
static_cast<float>(resize_args.output_height);
float width_ratio = static_cast<float>(input_width) /
static_cast<float>(resize_args.output_width);
resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio);
resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio);
int output_size =
resize_args.output_width * resize_args.output_height * input_channel;
float16* fpga_output = reinterpret_cast<float16*>(
paddle::zynqmp::fpga_malloc(output_size * sizeof(float16)));
resize_args.input_image_address = input_image_address;
resize_args.output_image_address = fpga_output;
memset(fpga_output, 0, output_size * sizeof(float16));
paddle::zynqmp::fpga_flush(
input_image_address,
input_width * input_height * input_channel * sizeof(float16));
paddle::zynqmp::fpga_flush(resize_args.output_image_address,
output_size * sizeof(float16));
int ret = paddle::zynqmp::compute_fpga_resize(resize_args);
if (ret == 0) {
paddle::zynqmp::fpga_invalidate(resize_args.output_image_address,
output_size * sizeof(float16));
}
for (int i = 0; i < output_size; i++) {
output[i] = fpga_output[i];
}
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdlib.h>
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
#include "lite/backends/fpga/KD/pe.hpp"
void fpga_resize(float* input,
int input_width,
int input_height,
int input_channel,
uint8_t* output,
int output_width,
int output_height);
......@@ -151,6 +151,10 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
void clear() {
buffer_->Free();
offset_ = 0;
}
// Other share data to this.
void ShareDataWith(const TensorLite &other);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int axis_size, int flag, int width) {
const int x = get_global_id(0); // image_width cxw/4
const int y = get_global_id(1); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int xx = x / width;
if (flag == 0){
xx = y / width;
}
if (xx < axis_size){
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}else{
int new_val = xx - axis_size;
new_val *= width;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, (int2)(new_val, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
// WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
__kernel void concat_mul(__read_only image2d_t input0,
__write_only image2d_t output,
int axis_size, int flag, int width, int start) {
const int x = get_global_id(0); // image_width cxw/4
const int y = get_global_id(1); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int xx = x / width;
if (flag == 0){
xx = y / width;
}
if (xx < axis_size && xx >= start){
xx -= start;
xx *= width;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
}
......@@ -12,8 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cl_common.h>
#define PADDLE_LITE_ZU5
#define FPGA_PRINT_MODE
#define PADDLE_LITE_PROFILE
__kernel void scale(__read_only image2d_t input,
__write_only image2d_t output,
__private float scale,
__private float bias){
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = convert_float(scale) * in + convert_float(bias);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
......@@ -110,11 +110,11 @@ void set_constant(const lite::Context<Target>& context,
lite::Tensor* tensor,
float value) {
TensorSetConstantWithTarget<Target> func(context, tensor, value);
//#ifdef PADDLE_WITH_CUDA
// #ifdef PADDLE_WITH_CUDA
// tensor->target().apply_visitor(func);
//#else
// #else
func();
//#endif
// #endif
}
template <typename T>
......@@ -128,12 +128,14 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = lite::fluid::EigenMatrix<T>::From(input);
auto vec = lite::fluid::EigenVector<T>::Flatten(vector);
auto out = lite::fluid::EigenMatrix<T>::From(*output);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
for (int64_t j = 0; j < size; ++j) {
output_data[i * size + j] =
input_data[i * size + j] + vector_data[j];
}
}
}
};
......
......@@ -25,6 +25,7 @@ lite_cc_library(mir_passes
elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
......
......@@ -29,6 +29,11 @@ void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
act_types.push_back("leaky_relu");
break;
}
if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
act_types.push_back("relu6");
act_types.push_back("leaky_relu");
break;
}
}
for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
for (auto act_type : act_types) {
......
......@@ -27,10 +27,24 @@ namespace mir {
void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// delete quant node
std::vector<std::string> quant_op_types = {
"fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
"fake_quantize_moving_average_abs_max"};
/*
for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
for (int i = 5; i >= 1; --i){
fusion::DynamicQuantDequantOpFuser fuser("fake_quantize_abs_max", op_type,
i);
fuser(graph.get());
}
}
*/
for (auto& op_type : quant_op_types) {
fusion::DeleteQuantOpFuser fuser(op_type);
fuser(graph.get());
fusion::DeleteDynamicQuantOpFuser dfuser(op_type);
dfuser(graph.get());
}
// fuse quantized node and dequant node
......
......@@ -77,6 +77,55 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc;
}
void DeleteDynamicQuantOpFuser::BuildPattern() {
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_op_type_, "X");
auto* quant_node =
OpNode("quant_node", quant_op_type_)->assert_is_op(quant_op_type_);
auto* output_scale_node =
VarNode("output_scale_node")
->assert_is_op_output(quant_op_type_, "OutScale");
auto* output_act_node =
VarNode("output_act_node")->assert_is_op_output(quant_op_type_, "Out");
quant_node->LinksFrom({input_act_node});
output_scale_node->LinksFrom({quant_node});
output_act_node->LinksFrom({quant_node});
VLOG(4) << "DeleteQuantOpFuser BuildPattern quant_op_type:" << quant_op_type_;
}
void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* input_act_node = matched.at("input_act_node");
auto* quant_node = matched.at("quant_node");
auto* output_scale_node = matched.at("output_scale_node");
auto* output_act_node = matched.at("output_act_node");
// obtain values, save values and relink node
int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
auto* scope = quant_node->stmt()->op()->scope();
auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name)
->GetMutable<lite::Tensor>();
float scale_value = scale_tensor->data<float>()[0] / range;
auto outlinks = output_act_node->outlinks;
for (auto* quantized_node : outlinks) {
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
IR_NODE_LINK_TO(input_act_node, quantized_node)
}
// delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {
quant_node, output_scale_node, output_act_node};
GraphSafeRemoveNodes(graph, nodes2rm);
}
cpp::OpDesc DeleteDynamicQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
void DequantOpFuser::BuildPattern() {
std::string weight_name = "";
if (quantized_op_type_ == "conv2d" ||
......@@ -130,8 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
auto& valid_places = quantized_op->stmt()->op()->valid_places();
int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
float input_scale = 0;
if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
}
float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
......@@ -162,8 +214,12 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
#ifndef LITE_WITH_FPGA
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
#endif
if (quantized_op->stmt()->op_info()->HasAttr("input_scale")) {
op_desc.SetAttr("input_scale", input_scale);
}
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
......@@ -171,12 +227,29 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
size_t weight_num = quantized_weight_t->data_size();
#ifdef LITE_WITH_FPGA
float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kFloat));
#else
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kInt8));
#endif
// int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
// for (size_t i = 0; i < weight_num; i++) {
// quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
// }
// quantized_weight_t->set_persistable(true);
// quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
......@@ -464,6 +537,197 @@ cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
// ================dynamic quant fuse==============
// #define DYNAMIC_RANGE
void DynamicQuantDequantOpFuser::BuildPattern() {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
std::string weight_name = "";
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
weight_name = "Filter";
} else {
weight_name = "Y";
}
auto* quant_op_input = VarNode("quant_op_input")
->assert_is_op_input(quant_type_, "X")
->AsInput();
#ifdef DYNAMIC_RANGE
auto* quant_op_in_scale = VarNode("quant_op_in_scale")
->assert_is_op_input(quant_type_, "InScale")
->AsIntermediate();
#endif
auto* quant_op = OpNode("quant_op", quant_type_)
->assert_is_op(quant_type_)
->AsIntermediate();
auto* quant_op_out_scale =
VarNode("quant_op_out_scale")
->assert_is_op_output(quant_type_, "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto* quant_op_out = VarNode("quant_op_out")
->assert_is_op_output(quant_type_, "Out")
->assert_is_op_input(op_type_)
->AsIntermediate();
std::vector<PMNode*> nodes;
for (int i = 0; i < times_; i++) {
nodes.push_back(VarNode(string_format("quantized_op_weight%d", i))
->assert_is_op_input(op_type_, weight_name)
->AsInput());
nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_)
->assert_is_op(op_type_)
->AsIntermediate());
nodes.push_back(VarNode(string_format("quantized_op_out%d", i))
->assert_is_op_output(op_type_)
->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate());
nodes.push_back(
OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs")
->assert_is_op("fake_dequantize_max_abs")
->AsIntermediate());
nodes.push_back(VarNode(string_format("dequant_op_out%d", i))
->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput());
}
#ifdef DYNAMIC_RANGE
quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
#endif
quant_op->LinksFrom({quant_op_input});
quant_op_out->LinksFrom({quant_op});
quant_op_out_scale->LinksFrom({quant_op});
for (int i = 0; i < times_; i++) {
nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
{quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOffset]});
nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kDequantOpOffset]});
}
}
void DynamicQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
auto* quant_op_input = matched.at("quant_op_input");
#ifdef DYNAMIC_RANGE
auto* quant_op_in_scale = matched.at("quant_op_in_scale");
#endif
auto* quant_op = matched.at("quant_op");
std::vector<Node*> nodes;
for (int i = 0; i < times_; i++) {
nodes.push_back(matched.at(string_format("quantized_op_weight%d", i)));
nodes.push_back(matched.at(string_format("quantized_op%d", i)));
nodes.push_back(matched.at(string_format("quantized_op_out%d", i)));
nodes.push_back(matched.at(string_format("dequant_op%d", i)));
nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
}
int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
auto* scope = quant_op->stmt()->op()->scope();
auto& valid_places = quant_op->stmt()->op()->valid_places();
int range = ((1 << (bit_length - 1)) - 1);
#ifdef DYNAMIC_RANGE
auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
->GetMutable<lite::Tensor>();
float input_scale = input_scale_t->data<float>()[0] / range;
VLOG(4) << "range: " << range << " input_scale: " << input_scale;
#endif
for (int i = 0; i < times_; i++) {
float max_range = nodes[i * kNumFields + kDequantOpOffset]
->stmt()
->op_info()
->GetAttr<float>("max_range");
// weight_scale = max(abs(weight))
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
cpp::OpDesc op_desc =
*nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
auto quantized_weight_var_name =
nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
auto quantized_weight_t =
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
std::vector<float> weight_scale;
int weight_scale_size;
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
op_desc.SetOutput(
"Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// be Cout.
weight_scale_size = quantized_weight_t->dims()[0];
} else if (op_type_ == "mul") {
op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
op_desc.SetOutput(
"Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout.
weight_scale_size = quantized_weight_t->dims()[1];
}
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
// op_desc.SetAttr("enable_int8", true);
// op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
Tensor temp_tensor;
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
size_t weight_num = quantized_weight_t->data_size();
quantized_weight_t->set_persistable(true);
std::cout << "DynamicQuantDequantOpFuser::InsertNewNode===================="
"========================================"
<< std::endl;
#ifdef LITE_WITH_FPGA
float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = temp_data[i] * whole_weight_scale;
std::cout << whole_weight_scale << "," << temp_data[i] << ","
<< quantized_weight_data[i] << std::endl;
}
quantized_weight_t->set_precision(PRECISION(kFloat));
#else
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < weight_num; i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_precision(PRECISION(kInt8));
#endif
auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
quantized_op->Attach(op_desc, scope);
auto* new_op_node =
graph->GraphCreateInstructNode(quantized_op, valid_places);
IR_NODE_LINK_TO(quant_op_input, new_op_node);
IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
new_op_node);
IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
}
}
cpp::OpDesc DynamicQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
} // namespace fusion
} // namespace mir
......
......@@ -52,6 +52,19 @@ class DeleteQuantOpFuser : public FuseBase {
private:
std::string quant_op_type_{};
};
class DeleteDynamicQuantOpFuser : public FuseBase {
public:
explicit DeleteDynamicQuantOpFuser(const std::string& quant_op_type)
: quant_op_type_(quant_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quant_op_type_{};
};
/* DequantOpFuser process conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
*/
......@@ -106,6 +119,24 @@ class DeleteQuantDequantOpFuser : public FuseBase {
private:
std::string quantized_op_type_{};
};
// dynamic quantdequant op fuser
class DynamicQuantDequantOpFuser : public FuseBase {
public:
explicit DynamicQuantDequantOpFuser(const std::string& quantized_op_type,
const std::string& op_type,
int i)
: op_type_(op_type), quant_type_(quantized_op_type), times_(i) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string op_type_{};
std::string quant_type_{};
int times_{1};
};
} // namespace fusion
} // namespace mir
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void KernelPlaceCorrectPass::Apply(const std::unique_ptr<SSAGraph> &graph) {
CorrectArgumentPlace(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(kernel_place_correct_pass,
paddle::lite::mir::KernelPlaceCorrectPass)
.BindTargets({TARGET(kFPGA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* Correct the place of the variables in the SSAGraph; it infers each
* variable's place from the kernels that output it.
*/
class KernelPlaceCorrectPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
private:
void CorrectArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator and does not support type inference.
// On FPGA we have io_copy + cali + layout tool ops, so type inference is
// needed for tool operators.
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "checking op " << inst.op_info()->Repr();
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
bool need_correct_place = true;
std::vector<TargetType> in_types;
std::vector<TargetType> out_types;
for (auto* x_in : x->inlinks) {
std::string node_name = x_in->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
CHECK(arg_name.size() > 0) << "can not found op arguments for node "
<< node_name;
VLOG(4) << "-- input arg_name:" << arg_name << " "
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
if (!x_in->AsArg().type) {
need_correct_place &= false;
} else {
if (in_types.empty()) {
in_types.push_back(x_in->AsArg().type->target());
} else {
if (in_types[0] != x_in->AsArg().type->target()) {
need_correct_place &= false;
}
}
}
}
for (auto* x_out : x->outlinks) {
std::string node_name = x_out->AsArg().name;
std::string arg_name =
get_argname(node_name, inst.op_info()->outputs());
        CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
                                   << node_name << " in Inst "
                                   << inst.op_type();
VLOG(4) << "-- output arg_name " << arg_name;
auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
if (!x_out->AsArg().type) {
need_correct_place &= false;
} else {
if (out_types.empty()) {
out_types.push_back(x_out->AsArg().type->target());
} else {
if (out_types[0] != x_out->AsArg().type->target()) {
need_correct_place &= false;
}
}
}
}
      if (in_types.empty() || out_types.empty()) {
        need_correct_place = false;
      } else {
        auto this_type = inst.picked_kernel().target();
        bool io_target_same = (in_types[0] == out_types[0]);
        need_correct_place &= (io_target_same && (in_types[0] != this_type));
      }
if (need_correct_place) {
// update this kernel's valid place;
UpdateTarget(inst, in_types[0]);
}
}
}
  // Rebind the statement's kernels using a place whose target is replaced by
  // new_target.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT
auto new_place = inst.place();
new_place.target = new_target;
std::vector<Place> places;
places.push_back(new_place);
inst.ResetKernels(places);
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
}
valid_kernels_ = op_->CreateKernels(valid_places);
}
void mir::Node::Stmt::ResetKernels(const std::vector<Place> &valid_places) {
CHECK(op_) << "change valid place failed, not created op";
valid_kernels_.clear();
valid_kernels_ = op_->CreateKernels(valid_places);
}
mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
auto &x = AsArg();
......
......@@ -53,6 +53,7 @@ class Node {
const std::vector<Place>& valid_places,
lite::Scope* scope = nullptr);
void ResetKernels(const std::vector<Place>& valid_places);
std::string op_type() const { return op_info()->Type(); }
const OpInfo* op_info() const;
OpInfo* mutable_op_info();
......
......@@ -140,10 +140,12 @@ void SSAGraph::Build(const Program &program,
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
}
/*
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
}
*/
if (is_weights(name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(arg_node, op_node);
......@@ -153,10 +155,12 @@ void SSAGraph::Build(const Program &program,
auto *arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
/*
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
}
*/
if (is_weights(name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
......
......@@ -157,7 +157,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
lite_api::LiteModelType::kNaiveBuffer);
// Load optimized model
lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_model_from_file(optimized_model_dir + ".nb");
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config);
......
......@@ -101,7 +101,6 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto io_copy_output_name =
string_format("%s/target_trans", in->AsArg().name.c_str());
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
if (copied_nodes->count(in->AsArg().name)) {
// Remove the old link
RemoveDirectedLink(in, inst_node);
......@@ -116,12 +115,14 @@ void TypeTargetTransformPass::AddIoCopyInst(
} else {
// TODO(MyPandaShaoxiang) should set same place with input?
auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(),
// from.layout()
// Set the place for io_copy_output_arg node, the target should be equal to
// to.target()
// The precision and layout should be equal to from.precision(),
// from.layout()
#ifndef LITE_WITH_FPGA
io_copy_output_arg->AsArg().type =
LiteType::GetTensorTy(to.target(), from.precision(), from.layout());
#endif
auto* io_copy_inst = graph->NewInstructNode();
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
......
......@@ -77,6 +77,7 @@ class Optimizer {
#endif
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
"kernel_place_correct_pass",
// info(target/precision/layout/device)
// using kernel info
"argument_type_display_pass", // debug pass: show arg-type-node's
......@@ -108,7 +109,9 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
#ifndef LITE_WITH_FPGA
"memory_optimize_pass",
#endif
"npu_subgraph_pass",
"xpu_subgraph_pass"}};
RunPasses(passes_local);
......
......@@ -137,11 +137,16 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue;
std::string op_type = inst.op()->op_info()->Type();
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
File mode changed from 100644 to 100755
......@@ -42,7 +42,7 @@ static std::string version() {
std::string tag = paddlelite_tag();
if (tag.empty()) {
ss << paddlelite_branch() << "(" << paddlelite_commit() << ")";
ss << paddlelite_commit();
} else {
ss << tag;
}
......
# C++ Demo
> Welcome to join the official PaddleLite Baidu QQ group (696965088), where engineers will answer your questions.
1. Environment setup
- Make sure the Android NDK is under the /opt directory
- A computer that can build PaddleLite
- An Android phone with an armv7 or armv8 CPU
2. Face detection and mask-wearing classification demo
Prepare the build environment by following the [source compilation guide](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/).
Run the following commands to download the PaddleLite code.
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
```
Enter the PaddleLite root directory and build the inference library.
```shell
./lite/tools/build.sh \
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
--android_stl=c++_static \
--build_extra=ON \
--shutdown_log=OFF \
tiny_publish
```
Enter the build directory, download the archive containing the model and test image, and build the executable.
```shell
cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mask_detection
wget https://paddle-inference-dist.bj.bcebos.com/mask_detection.tar.gz
tar zxvf mask_detection.tar.gz
make
```
Alternatively, you can also download the face detection model and the mask-wearing classification model via PaddleHub.
```
# After installing paddlehub, run the following Python code
import paddlehub as hub
pyramidbox_lite_mask = hub.Module(name="pyramidbox_lite_mask")
# Save the models into the test_program folder
pyramidbox_lite_mask.processor.save_inference_model(dirname="test_program")
```
The commands above produce the face detection and mask-wearing classification models, stored in pyramidbox_lite and mask_detector respectively. In each folder, `__model__` is the model structure file and `__param__` is the weights file.
Connect the Android phone to the computer, then push the executable, test image, model files, and inference library to the phone.
```
adb push mask_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push face_detection /data/local/tmp
adb push mask_classification /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mask_detection
```
Log into the phone and run the demo.
```
adb shell
cd /data/local/tmp
export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH
./mask_detection face_detection mask_classification test.jpg
```
Back on the computer, pull the result and check the output image shown below.
```
adb pull /data/local/tmp/test_mask_detection_result.jpg ./
```
![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg)
3. Build and run the full-API demo (note: this demo does not exist when the build mode is tiny_publish)
```shell
cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
......@@ -17,7 +86,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
```
On success, the console prints the predicted probabilities of the top-10 classes.
4. Build and run the light-API demo
```shell
cd ../mobile_light
make
......@@ -29,7 +98,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
```
On success, the console prints the predicted probabilities of the top-10 classes.
5. Build and run the SSD object detection demo
```shell
cd ../ssd_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
......@@ -46,7 +115,7 @@ adb pull /data/local/tmp/test_ssd_detection_result.jpg ./
```
On success, the detection result image test_ssd_detection_result.jpg is generated in the ssd_detection directory.
6. Build and run the YOLOv3 object detection demo
```shell
cd ../yolov3_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz
......@@ -63,7 +132,7 @@ adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
```
On success, the detection result image test_yolov3_detection_result.jpg is generated in the yolov3_detection directory.
7. Build and run the image classification demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
......@@ -71,41 +140,41 @@ tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push mobile_classify /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobile_classify
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the console prints the predicted probabilities of the top-5 classes.
- To see the top-10 class probabilities instead, pass the topk value on the command line, e.g.:
```shell
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- To classify with another model, pass its model_dir and the model's input size on the command line, e.g.:
```shell
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
8. Build and run the model unit-test demo that uses the CV preprocessing library
```shell
cd ../test_cv
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push test_model_cv /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/test_model_cv
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the console prints the predicted probabilities of the top-10 classes.
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                            #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mask_detection: fetch_opencv mask_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS)
mask_detection.o: mask_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mask_detection.o
rm -f mask_detection
ARM_ABI = arm8
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                            #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mask_detection: fetch_opencv mask_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS)
mask_detection.o: mask_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mask_detection.o
rm -f mask_detection
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
struct Object {
int batch_id;
cv::Rect rec;
int class_id;
float prob;
};
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
// Fill the tensor with mean/scale normalization and transform layout
// NHWC -> NCHW, accelerated with NEON.
void neon_mean_scale(const float* din,
float* dout,
int size,
const std::vector<float> mean,
const std::vector<float> scale) {
if (mean.size() != 3 || scale.size() != 3) {
std::cerr << "[ERROR] mean or scale size must equal to 3\n";
exit(1);
}
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(scale[0]);
float32x4_t vscale1 = vdupq_n_f32(scale[1]);
float32x4_t vscale2 = vdupq_n_f32(scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
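  // NEON path: de-interleave 4 RGB pixels per iteration, subtract the mean and
  // multiply by the scale per channel, then store the results channel-planar.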
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) * scale[0];
*(dout_c1++) = (*(din++) - mean[1]) * scale[1];
*(dout_c2++) = (*(din++) - mean[2]) * scale[2];
}
}
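// Resize the image to (width, height), optionally scale pixel values to
// [0, 1), then normalize with mean/scale and convert HWC -> CHW via
// neon_mean_scale.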
void pre_process(const cv::Mat& img,
int width,
int height,
const std::vector<float>& mean,
const std::vector<float>& scale,
float* data,
bool is_scale = false) {
cv::Mat resized_img;
cv::resize(
img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC);
cv::Mat imgf;
float scale_factor = is_scale ? 1.f / 256 : 1.f;
resized_img.convertTo(imgf, CV_32FC3, scale_factor);
const float* dimg = reinterpret_cast<const float*>(imgf.data);
neon_mean_scale(dimg, data, width * height, mean, scale);
}
void RunModel(std::string det_model_dir,
std::string class_model_dir,
std::string img_path) {
// Prepare
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
float shrink = 0.2;
int width = img.cols;
int height = img.rows;
int s_width = static_cast<int>(width * shrink);
int s_height = static_cast<int>(height * shrink);
// Detection
MobileConfig config;
config.set_model_dir(det_model_dir);
  // Create Predictor For Detection Model
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
// Get Input Tensor
std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
input_tensor0->Resize({1, 3, s_height, s_width});
auto* data = input_tensor0->mutable_data<float>();
// Do PreProcess
std::vector<float> detect_mean = {104.f, 117.f, 123.f};
std::vector<float> detect_scale = {0.007843, 0.007843, 0.007843};
pre_process(img, s_width, s_height, detect_mean, detect_scale, data, false);
// Detection Model Run
predictor->Run();
// Get Output Tensor
std::unique_ptr<const Tensor> output_tensor0(
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor0->data<float>();
auto shape_out = output_tensor0->shape();
int64_t out_len = ShapeProduction(shape_out);
// Filter Out Detection Box
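  // Each detection occupies 6 floats: [label, score, xmin, ymin, xmax, ymax],
  // with coordinates normalized to [0, 1] relative to the input image.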
float detect_threshold = 0.3;
std::vector<Object> detect_result;
for (int i = 0; i < out_len / 6; ++i) {
if (outptr[1] >= detect_threshold) {
Object obj;
int xmin = static_cast<int>(width * outptr[2]);
int ymin = static_cast<int>(height * outptr[3]);
int xmax = static_cast<int>(width * outptr[4]);
int ymax = static_cast<int>(height * outptr[5]);
int w = xmax - xmin;
int h = ymax - ymin;
cv::Rect rec_clip =
cv::Rect(xmin, ymin, w, h) & cv::Rect(0, 0, width, height);
obj.rec = rec_clip;
detect_result.push_back(obj);
}
outptr += 6;
}
// Classification
config.set_model_dir(class_model_dir);
// Create Predictor For Classification Model
predictor = CreatePaddlePredictor<MobileConfig>(config);
// Get Input Tensor
std::unique_ptr<Tensor> input_tensor1(std::move(predictor->GetInput(0)));
int classify_w = 128;
int classify_h = 128;
input_tensor1->Resize({1, 3, classify_h, classify_w});
auto* input_data = input_tensor1->mutable_data<float>();
int detect_num = detect_result.size();
std::vector<float> classify_mean = {0.5f, 0.5f, 0.5f};
std::vector<float> classify_scale = {1.f, 1.f, 1.f};
float classify_threshold = 0.5;
for (int i = 0; i < detect_num; ++i) {
cv::Rect rec_clip = detect_result[i].rec;
cv::Mat roi = img(rec_clip);
// Do PreProcess
pre_process(roi,
classify_w,
classify_h,
classify_mean,
classify_scale,
input_data,
true);
// Classification Model Run
predictor->Run();
// Get Output Tensor
std::unique_ptr<const Tensor> output_tensor1(
std::move(predictor->GetOutput(1)));
auto* outptr = output_tensor1->data<float>();
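    // outptr[1] is the probability of the "wear mask" class; compare it with
    // classify_threshold to pick the label.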
// Draw Detection and Classification Results
cv::rectangle(img, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
std::string text = outptr[1] > classify_threshold ? "wear mask" : "no mask";
int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
double font_scale = 1.f;
int thickness = 1;
cv::Size text_size =
cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width;
text_size =
cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr);
cv::Point origin;
origin.x = rec_clip.x + 5;
origin.y = rec_clip.y + text_size.height + 5;
cv::putText(img,
text,
origin,
font_face,
new_font_scale,
cv::Scalar(0, 255, 255),
thickness,
cv::LINE_AA);
std::cout << "detect face, location: x=" << rec_clip.x
<< ", y=" << rec_clip.y << ", width=" << rec_clip.width
<< ", height=" << rec_clip.height
<< ", wear mask: " << (outptr[1] > classify_threshold)
<< std::endl;
}
// Write Result to Image File
int start = img_path.find_last_of("/");
int end = img_path.find_last_of(".");
std::string img_name = img_path.substr(start + 1, end - start - 1);
std::string result_name = img_name + "_mask_detection_result.jpg";
cv::imwrite(result_name, img);
}
int main(int argc, char** argv) {
  if (argc < 4) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " detection_model_dir classification_model_dir image_path\n";
exit(1);
}
std::string detect_model_dir = argv[1];
std::string classify_model_dir = argv[2];
std::string img_path = argv[3];
RunModel(detect_model_dir, classify_model_dir, img_path);
return 0;
}
......@@ -82,8 +82,8 @@ void neon_mean_scale(const float* din,
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) * scale[0];
*(dout_c0++) = (*(din++) - mean[1]) * scale[1];
*(dout_c0++) = (*(din++) - mean[2]) * scale[2];
*(dout_c1++) = (*(din++) - mean[1]) * scale[1];
*(dout_c2++) = (*(din++) - mean[2]) * scale[2];
}
}
......@@ -188,13 +188,12 @@ void RunModel(std::string model_dir, std::string img_path) {
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor->data<float>();
auto shape_out = output_tensor->shape();
int64_t cnt = 1;
for (auto& i : shape_out) {
cnt *= i;
}
int64_t cnt = ShapeProduction(shape_out);
auto rec_out = detect_object(outptr, static_cast<int>(cnt / 6), 0.6f, img);
std::string result_name =
img_path.substr(0, img_path.find(".")) + "_ssd_detection_result.jpg";
int start = img_path.find_last_of("/");
int end = img_path.find_last_of(".");
std::string img_name = img_path.substr(start + 1, end - start - 1);
std::string result_name = img_name + "_ssd_detection_result.jpg";
cv::imwrite(result_name, img);
}
......
......@@ -46,7 +46,6 @@ class Tensor {
*/
class PaddlePredictor {
public:
void Init();
std::unique_ptr<Tensor> GetTensor(const std::string &id) const;
std::unique_ptr<Tensor> GetMutableTensor(const std::string &id);
......
......@@ -62,6 +62,10 @@ void CastCompute::Run() {
int32_t* out_data = param.Out->mutable_data<int32_t>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<int64_t, int32_t>);
} else if (param.in_dtype == 3 && param.out_dtype == 5) {
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
memcpy(o_data, x_data, sizeof(float) * param.X->numel());
} else {
LOG(FATAL) << "other has not been implemented";
}
......
......@@ -95,7 +95,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> {
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
......
......@@ -60,25 +60,10 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
// auto data = param.Out->template mutable_data<T>();
auto data = param.Out->template mutable_data<float>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
}
......@@ -94,32 +79,38 @@ class FillConstantBatchLikeCompute
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>();
if (param.input->lod().size() && param.input_dim_idx == 0) {
auto odims = param.out->dims();
odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
param.out->Resize(odims);
// auto data = param.out->template mutable_data<T>();
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
auto data = param.out->template mutable_data<float>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
}
// if (param.input->lod().size() && param.input_dim_idx == 0) {
// auto odims = param.out->dims();
// odims[param.output_dim_idx] = param.input->lod().back().size() - 1;
// param.out->Resize(odims);
// }
// if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
// auto data = param.out->template mutable_data<float>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT32)) {
// auto data = param.out->template mutable_data<int32_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else if (param.dtype ==
// static_cast<int32_t>(lite::core::FluidType::INT8)) {
// auto data = param.out->template mutable_data<int8_t>();
// for (int i = 0; i < param.out->numel(); i++) {
// data[i] = param.value;
// }
// } else {
// LOG(FATAL) << "not supported dtype " << param.dtype;
// }
}
virtual ~FillConstantBatchLikeCompute() = default;
......@@ -142,8 +133,9 @@ REGISTER_LITE_KERNEL(fill_constant,
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("ShapeTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(fill_constant_batch_size_like,
kARM,
kAny,
......
......@@ -36,7 +36,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims();
int64_t ids_numel = ids->numel();
auto ids_data = ids->data<int64_t>();
auto ids_data = ids->data<float>();
int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1];
......@@ -75,7 +75,6 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
......
......@@ -15,7 +15,12 @@ lite_cc_library(subgraph_bridge_softmax_op_bm SRCS softmax_op.cc DEPS ${subgraph
lite_cc_library(subgraph_bridge_mul_op_bm SRCS mul_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_bm SRCS batch_norm_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_scale_op_bm SRCS scale_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_bm SRCS concat_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_dropout_op_bm SRCS dropout_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_bm SRCS transpose_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_bm SRCS reshape_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_norm_op_bm SRCS norm_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_prior_box_op_bm SRCS prior_box_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_engine
......@@ -28,4 +33,10 @@ set(bm_subgraph_bridges
subgraph_bridge_mul_op_bm
subgraph_bridge_batch_norm_op_bm
subgraph_bridge_scale_op_bm
subgraph_bridge_concat_op_bm
subgraph_bridge_dropout_op_bm
subgraph_bridge_transpose_op_bm
subgraph_bridge_reshape_op_bm
subgraph_bridge_norm_op_bm
subgraph_bridge_prior_box_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
......@@ -45,7 +45,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
CHECK_EQ(op_type, "relu");
float alpha = 0.f;
if (op_type == "relu") {
} else if (op_type == "leaky_relu") {
alpha = op_info->GetAttr<float>("alpha");
} else {
LOG(FATAL) << "[BM] unsupport act type";
return FAILED;
}
add_relu_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
......@@ -53,7 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
0.f,
alpha,
-1.f);
graph->AddNode(output_var_name);
return SUCCESS;
......@@ -65,3 +72,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kBM, paddle::lite::subgraph::bm::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kBM,
paddle::lite::subgraph::bm::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_names = op_info->Input("X");
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
const int32_t input_num = x_names.size();
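  // Gather per-input shape/rank/name arrays in the raw form expected by
  // add_concat_layer; the temporaries are freed right after the layer is added.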
int32_t** shape = new int32_t*[input_num];
int32_t* dim = new int32_t[input_num];
const char** name = new const char*[input_num];
for (size_t i = 0; i < x_names.size(); i++) {
auto x = scope->FindMutableTensor(x_names[i]);
name[i] = x_names[i].c_str();
auto x_dims = x->dims();
dim[i] = x_dims.size();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
shape[i] = new int32_t[x_dims.size()];
for (size_t j = 0; j < x_dims.size(); j++) {
shape[i][j] = static_cast<int32_t>(x_shape_data[j]);
}
}
auto axis = op_info->GetAttr<int>("axis");
add_concat_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
axis);
for (size_t i = 0; i < x_names.size(); i++) {
delete[] shape[i];
}
delete[] shape;
delete[] name;
delete[] dim;
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kBM,
paddle::lite::subgraph::bm::ConcatConverter);
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
......@@ -58,10 +57,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < input_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int>(input_shape_data[i]);
i_input_shape_data[i] = static_cast<int32_t>(input_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
}
const float* filter_data =
const_cast<const float*>(filter->mutable_data<float>());
......@@ -69,7 +68,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
add_conv_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
input_dims.size(),
......@@ -104,3 +102,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kBM,
paddle::lite::subgraph::bm::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kBM,
paddle::lite::subgraph::bm::ConvConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto dropout_prob = op_info->GetAttr<float>("dropout_prob");
auto dropout_implementation =
op_info->GetAttr<std::string>("dropout_implementation");
CHECK_EQ(dropout_implementation, "downgrade_in_infer");
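  // With "downgrade_in_infer", inference-time dropout reduces to an
  // element-wise multiply of the input by (1 - dropout_prob), mapped here to a
  // constant BINARY_MUL layer.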
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
1.f - dropout_prob,
static_cast<const char*>(output_var_name.c_str()),
BINARY_MUL,
0);
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(dropout,
kBM,
paddle::lite::subgraph::bm::DropoutConverter);
......@@ -14,6 +14,7 @@
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
......@@ -68,42 +69,52 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
if (y_is_const) {
CHECK_EQ(op_type, "elementwise_add");
}
auto axis = op_info->GetAttr<int>("axis");
int op_code{-1};
int eltwise_if_code{-1};
float coeff[2] = {1.f, 1.f};
if (op_type == "elementwise_mul") {
op_code = 0;
op_code = BINARY_MUL;
eltwise_if_code = 0;
} else if (op_type == "elementwise_add") {
op_code = 1;
op_code = BINARY_ADD;
eltwise_if_code = 1;
} else if (op_type == "elementwise_sub") {
op_code = 1;
op_code = BINARY_SUB;
eltwise_if_code = 1;
coeff[1] = -1.f;
} else {
LOG(FATAL) << "UNSUPPORTED ELTWISE OPERATION: " << op_type;
}
if (!y_is_const) {
add_eltwise_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
op_code,
coeff);
} else {
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[0],
dim[0],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
const float* y_data = const_cast<const float*>(y->mutable_data<float>());
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
std::vector<int32_t> i_expand_shape_data(3);
if (y_is_const) {
if (dim[0] == dim[1] || 2 == dim[0]) {
bm_add_const_tensor(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(y_data));
} else if (1 == dim[1] && 1 == axis) {
add_expand_ndims_layer(graph->GetCompilerHandle(),
name[1],
shape[1],
dim[1],
static_cast<const float*>(y_data),
-1,
2,
static_cast<const char*>(unique_op_name.c_str()));
name[1] = static_cast<const char*>(unique_op_name.c_str());
dim[1] = 3;
i_expand_shape_data[0] = i_y_shape_data[0];
i_expand_shape_data[1] = 1;
i_expand_shape_data[2] = 1;
shape[1] = &i_expand_shape_data[0];
y_data = nullptr;
}
add_binary_layer_v2(graph->GetCompilerHandle(),
name[0],
shape[0],
......@@ -111,12 +122,23 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
0,
static_cast<const float*>(x_data),
name[1],
shape[0],
dim[0],
shape[1],
dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
0);
op_code);
} else {
add_eltwise_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
eltwise_if_code,
coeff);
}
delete[] shape;
delete[] name;
......@@ -133,3 +155,9 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
......@@ -41,8 +41,10 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
// add reshape layer
int i_x_reshape_shape_data[2];
for (size_t i = 0; i < 2; i++) {
i_x_reshape_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_reshape_shape_data[0] = static_cast<int>(x_shape_data[0]);
i_x_reshape_shape_data[1] = 1;
for (size_t i = 1; i < x_dims.size(); i++) {
i_x_reshape_shape_data[1] *= static_cast<int>(x_shape_data[i]);
}
int reshape_param[] = {0, -1};
auto unique_op_reshape_name =
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
float one = 1.f;
auto epsilon = op_info->GetAttr<float>("epsilon");
add_normalize_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
static_cast<const char*>(unique_op_name.c_str()),
0,
1,
&one,
epsilon);
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(norm, kBM, paddle::lite::subgraph::bm::NormConverter);
......@@ -15,10 +15,24 @@
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kBM);
USE_SUBGRAPH_BRIDGE(leaky_relu, kBM);
USE_SUBGRAPH_BRIDGE(conv2d, kBM);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_add, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kBM);
USE_SUBGRAPH_BRIDGE(pool2d, kBM);
USE_SUBGRAPH_BRIDGE(softmax, kBM);
USE_SUBGRAPH_BRIDGE(mul, kBM);
USE_SUBGRAPH_BRIDGE(batch_norm, kBM);
USE_SUBGRAPH_BRIDGE(scale, kBM);
USE_SUBGRAPH_BRIDGE(concat, kBM);
USE_SUBGRAPH_BRIDGE(dropout, kBM);
USE_SUBGRAPH_BRIDGE(transpose, kBM);
USE_SUBGRAPH_BRIDGE(transpose2, kBM);
USE_SUBGRAPH_BRIDGE(reshape, kBM);
USE_SUBGRAPH_BRIDGE(reshape2, kBM);
USE_SUBGRAPH_BRIDGE(flatten, kBM);
USE_SUBGRAPH_BRIDGE(flatten2, kBM);
USE_SUBGRAPH_BRIDGE(norm, kBM);
USE_SUBGRAPH_BRIDGE(prior_box, kBM);
......@@ -65,6 +65,12 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (pooling_type == "avg") {
average_exclusive = op_info->GetAttr<bool>("exclusive");
}
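  // Global pooling ignores the paddings and uses the whole input feature map
  // (H x W) as the pooling window.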
if (global_pooling) {
paddings[0] = 0;
paddings[1] = 0;
ksize[0] = i_x_shape_data[2];
ksize[1] = i_x_shape_data[3];
}
add_pooling_layer(
graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
typedef struct __tag_st_priorbox_param {
std::vector<float> min_sizes;
std::vector<float> max_sizes;
std::vector<float> aspect_ratios;
std::vector<float> variances;
float step_w;
float step_h;
float offset;
int32_t img_w;
int32_t img_h;
int32_t prior_num;
bool min_max_aspect_ratios_order;
bool clip;
bool flip;
} st_priorbox_param;
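// Expand the raw aspect-ratio list: 1.0 is always included, near-duplicates
// (within epsilon) are dropped, and the reciprocal of each ratio is appended
// when flip is enabled.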
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
}
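// Compute SSD prior boxes on the CPU. The returned malloc'ed buffer holds the
// box coordinates followed by their variances (2 * boxes->data_size() floats
// in total) and is handed to the BM priorbox layer as constant data.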
float* compute_priorbox_kernel(OpLite* op, st_priorbox_param* param) {
auto op_info = op->op_info();
auto scope = op->scope();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto var = scope->FindVar(var_var_name)->GetMutable<lite::Tensor>();
std::vector<float> expand_aspect_ratios;
ExpandAspectRatios(param->aspect_ratios, param->flip, &expand_aspect_ratios);
param->aspect_ratios.clear();
for (size_t i = 0; i < expand_aspect_ratios.size(); i++) {
param->aspect_ratios.push_back(expand_aspect_ratios[i]);
}
param->prior_num = param->aspect_ratios.size() * param->min_sizes.size();
if (param->max_sizes.size() > 0) {
param->prior_num += param->max_sizes.size();
}
int32_t win1 = in_dims[3];
int32_t hin1 = in_dims[2];
DDim shape_out({hin1, win1, param->prior_num, 4});
boxes->Resize(shape_out);
var->Resize(shape_out);
// boxes->mutable_data<float>();
// var->mutable_data<float>();
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * boxes->data_size() * 2));
CHECK(cpu_data != nullptr);
const int32_t width = in_dims[3];
const int32_t height = in_dims[2];
int32_t img_width = param->img_w;
int32_t img_height = param->img_h;
if (img_width == 0 || img_height == 0) {
img_width = img_dims[3];
img_height = img_dims[2];
}
float step_w = param->step_w;
float step_h = param->step_h;
if (step_w == 0.f || step_h == 0.f) {
step_w = static_cast<float>(img_width) / width;
step_h = static_cast<float>(img_height) / height;
}
float offset = param->offset;
int32_t channel_size = height * width * param->prior_num * 4;
int32_t idx = 0;
///////////////////////////////////////////////////////////////////////
for (int32_t h = 0; h < height; ++h) {
for (int32_t w = 0; w < width; ++w) {
float center_x = (w + offset) * step_w;
float center_y = (h + offset) * step_h;
float box_width = 0.f;
float box_height = 0.f;
float* min_buf = reinterpret_cast<float*>(malloc(sizeof(float) * 4));
float* max_buf = reinterpret_cast<float*>(malloc(sizeof(float) * 4));
float* com_buf = reinterpret_cast<float*>(
malloc(sizeof(float) * expand_aspect_ratios.size() * 4));
CHECK(min_buf != nullptr);
CHECK(max_buf != nullptr);
CHECK(com_buf != nullptr);
// LOG(INFO) << "the number of min_size is " << min_sizes_.size();
for (size_t s = 0; s < param->min_sizes.size(); ++s) {
int32_t min_idx = 0;
int32_t max_idx = 0;
int32_t com_idx = 0;
int32_t min_size = param->min_sizes[s];
//! first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size;
//! xmin
min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height;
if (param->max_sizes.size() > 0) {
int max_size = param->max_sizes[s];
//! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
box_width = box_height = sqrtf(min_size * max_size);
//! xmin
max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height;
}
//! rest of priors
for (size_t r = 0; r < expand_aspect_ratios.size(); ++r) {
float ar = expand_aspect_ratios[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar);
box_height = min_size / sqrt(ar);
//! xmin
com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height;
}
if (param->min_max_aspect_ratios_order) {
memcpy(cpu_data + idx, min_buf, sizeof(float) * min_idx);
idx += min_idx;
memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
idx += max_idx;
memcpy(cpu_data + idx, com_buf, sizeof(float) * com_idx);
idx += com_idx;
} else {
memcpy(cpu_data + idx, com_buf, sizeof(float) * com_idx);
idx += com_idx;
memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
idx += max_idx;
}
}
free(min_buf);
free(max_buf);
free(com_buf);
}
}
  //! clip the prior's coordinates such that they are within [0, 1]
if (param->clip) {
for (int32_t d = 0; d < channel_size; ++d) {
cpu_data[d] = std::min(std::max(cpu_data[d], 0.f), 1.f);
}
}
//! set the variance.
float* ptr = cpu_data + channel_size;
int count = 0;
for (int32_t h = 0; h < height; ++h) {
for (int32_t w = 0; w < width; ++w) {
for (int32_t i = 0; i < param->prior_num; ++i) {
for (int j = 0; j < 4; ++j) {
ptr[count] = param->variances[j];
++count;
}
}
}
}
return cpu_data;
}
int PriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
std::vector<int32_t> i_input_shape_data(in_dims.size());
for (size_t i = 0; i < in_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int32_t>(in_dims[i]);
}
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
// param
st_priorbox_param param;
param.clip = op_info->GetAttr<bool>("clip");
param.min_sizes = op_info->GetAttr<std::vector<float>>("min_sizes");
param.max_sizes = op_info->GetAttr<std::vector<float>>("max_sizes");
param.aspect_ratios = op_info->GetAttr<std::vector<float>>("aspect_ratios");
param.variances = op_info->GetAttr<std::vector<float>>("variances");
param.offset = op_info->GetAttr<float>("offset");
if (op_info->HasAttr("flip")) {
param.flip = op_info->GetAttr<bool>("flip");
}
if (op_info->HasAttr("img_w")) {
param.img_w = op_info->GetAttr<int32_t>("img_w");
}
if (op_info->HasAttr("img_h")) {
param.img_h = op_info->GetAttr<int32_t>("img_h");
}
if (op_info->HasAttr("step_w")) {
param.step_w = op_info->GetAttr<float>("step_w");
}
if (op_info->HasAttr("step_h")) {
param.step_h = op_info->GetAttr<float>("step_h");
}
if (op_info->HasAttr("prior_num")) {
param.prior_num = op_info->GetAttr<int32_t>("prior_num");
}
if (op_info->HasAttr("min_max_aspect_ratios_order")) {
param.min_max_aspect_ratios_order =
op_info->GetAttr<bool>("min_max_aspect_ratios_order");
}
  float* cpu_data = compute_priorbox_kernel(op, &param);
auto boxes_dims = boxes->dims();
std::vector<int32_t> i_pri_out_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_pri_out_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
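  // The precomputed buffer holds the boxes followed by the variances, so the
  // BM priorbox layer's output doubles the first dimension; it is split back
  // into Boxes and Variances further below.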
i_pri_out_shape_data[0] *= 2;
add_priorbox_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
in_dims.size(),
static_cast<const char*>(in_var_name.c_str()),
const_cast<const int*>(&i_pri_out_shape_data[0]),
boxes_dims.size(),
static_cast<const char*>(unique_op_name.c_str()),
static_cast<const float*>(cpu_data),
param.min_sizes.size(),
const_cast<const float*>(&param.min_sizes[0]),
param.max_sizes.size(),
const_cast<const float*>(&param.max_sizes[0]),
param.aspect_ratios.size(),
const_cast<const float*>(&param.aspect_ratios[0]),
static_cast<int>(param.flip),
static_cast<int>(param.clip),
param.variances.size(),
const_cast<const float*>(&param.variances[0]),
param.img_h,
param.img_w,
param.step_h,
param.step_w,
param.offset);
std::vector<int32_t> i_output_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
int32_t* shape[2];
int dim[2];
const char* name[2];
dim[0] = boxes_dims.size();
dim[1] = boxes_dims.size();
name[0] = static_cast<const char*>(boxes_var_name.c_str());
name[1] = static_cast<const char*>(var_var_name.c_str());
shape[0] = &i_output_shape_data[0];
shape[1] = &i_output_shape_data[0];
int split_size = 2;
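  // Split the stacked output evenly along the first dimension into the Boxes
  // and Variances graph outputs.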
add_tf_split_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_pri_out_shape_data[0]),
boxes_dims.size(),
static_cast<const char*>(unique_op_name.c_str()),
2,
shape,
dim,
name,
boxes_dims.size(),
0,
&split_size,
0);
graph->AddNode(boxes_var_name);
graph->AddNode(var_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(prior_box,
kBM,
paddle::lite::subgraph::bm::PriorBoxConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int32_t>(x_dims[i]);
}
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(output_dims[i]);
}
// auto axis = op_info->GetAttr<int>("axis");
add_reshape_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size());
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(reshape,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(reshape2,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(flatten,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(flatten2,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
......@@ -48,7 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < length; i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
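  // "axis" is optional; default to -1 (the last dimension) and convert
  // negative values to a real dimension index below.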
auto axis = op_info->GetAttr<int>("axis");
int32_t axis = -1;
if (op_info->HasAttr("axis")) {
axis = op_info->GetAttr<int>("axis");
}
if (axis < 0) {
axis += x_dims.size();
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto axis = op_info->GetAttr<std::vector<int>>("axis");
CHECK_EQ(axis.size(), x_dims.size());
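  // "axis" holds the full permutation of the input dimensions and is
  // forwarded directly to the BM transpose layer.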
add_transpose_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
DTYPE_FP32,
static_cast<const char*>(output_var_name.c_str()),
NULL,
const_cast<const int*>(&axis[0]));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(transpose,
kBM,
paddle::lite::subgraph::bm::TransposeConverter);
REGISTER_SUBGRAPH_BRIDGE(transpose2,
kBM,
paddle::lite::subgraph::bm::TransposeConverter);
......@@ -54,7 +54,7 @@ int SubgraphEngine::BuildDeviceProgram() {
}
std::string net_name = "paddle_bitmain";
__bmcompile_opt(
graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 1);
void* bmodel_data = nullptr;
unsigned int data_size = 0;
bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
......@@ -109,7 +109,6 @@ int SubgraphEngine::BuildDeviceProgram() {
net_info_->output_dtypes[i],
stage.output_shapes[i]);
}
return status;
}
......
......@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
......@@ -426,8 +426,14 @@ REGISTER_LITE_KERNEL(multiclass_nms,
kNCHW,
paddle::lite::kernels::host::MulticlassNmsCompute,
def)
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("BBoxes",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Scores",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Index",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <utility>
#include <vector>
#include "lite/backends/fpga/KD/debugger.hpp"
#include "lite/kernels/host/one_hot_compute.h"
#include "lite/utils/paddle_enforce.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
void OneHotCompute::Run() {
auto& param = Param<operators::OneHotParam>();
param.Out->mutable_data<float>();
int depth = param.depth;
if (param.depth_tensor) {
auto* depth_tensor = param.depth_tensor;
auto* depth_data = depth_tensor->data<int32_t>();
depth = depth_data[0];
auto in_dims = param.X->dims();
DDim out_dims(in_dims);
out_dims[out_dims.size() - 1] = depth;
param.Out->Resize(out_dims);
}
auto* p_in_data = param.X->data<float>();
auto numel = param.X->numel();
auto* p_out_data = param.Out->mutable_data<float>();
for (int i = 0; i < param.Out->numel(); ++i) {
p_out_data[i] = 0;
}
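  // Scatter phase: flattened input element i writes a 1 at column
  // p_in_data[i] of output row i. With allow_out_of_range, indices outside
  // [0, depth) are silently skipped; otherwise they trigger an enforce error.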
  if (param.allow_out_of_range) {
    for (int i = 0; i < numel; ++i) {
      if (p_in_data[i] >= 0 && p_in_data[i] < depth) {
        *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
      }
    }
  } else {
    for (int i = 0; i < numel; ++i) {
      PADDLE_ENFORCE_GE(
          p_in_data[i], 0, "Illegal index value, should be at least 0.");
      PADDLE_ENFORCE_LT(p_in_data[i],
                        depth,
                        "Illegal index value, should be less than depth (%d).",
                        depth);
      *(p_out_data + i * depth + (int)(p_in_data[i])) = 1.0;  // NOLINT
    }
  }
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(one_hot,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::OneHotCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
class OneHotCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~OneHotCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -46,17 +46,21 @@ REGISTER_LITE_KERNEL(reshape,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("ShapeTensor",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(reshape2,
......
......@@ -34,27 +34,29 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto input_type = kernel->GetInputDeclType("Input");
CHECK(input_type->precision() == PRECISION(kFloat));
CHECK(input_type->layout() == DATALAYOUT(kNCHW));
auto input = scope->FindTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w_type = kernel->GetInputDeclType("W");
CHECK(w_type->precision() == PRECISION(kFloat));
CHECK(w_type->layout() == DATALAYOUT(kNCHW));
auto w = scope->FindTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[NPU] input dims: " << input_dims << " w dims: " << w_dims
<< " m: " << m << " k: " << k << " n: " << n;
// Create input node and reshape it to (m, k, 1, 1)
std::shared_ptr<Node> input_node = nullptr;
......@@ -76,7 +78,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
transpose_w.Resize({n, k, 1, 1});
transpose_w.set_persistable(true);
auto transpose_w_data = transpose_w.mutable_data<float>();
auto w_data = w->data<float>();
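  // Transpose W from (k, n) to (n, k) so it matches the (n, k, 1, 1) layout
  // that transpose_w was resized to above for the FullConnection weight input.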
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
......@@ -85,10 +87,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto trans_w_node = graph->Add(w_name, transpose_w);
// FC node
auto fc_node = graph->Add<ge::op::FullConnection>(out_name);
auto fc_op = fc_node->data<ge::op::FullConnection>();
fc_op->set_input_x(*reshaped_input_node->data());
fc_op->set_input_w(*trans_w_node->data());
// Add bias node if bias tensor exists
if (HasInputArg(op_info, scope, "Bias")) {
std::shared_ptr<Node> bias_node = nullptr;
......@@ -99,19 +102,23 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto bias_type = kernel->GetInputDeclType("Bias");
CHECK(bias_type->precision() == PRECISION(kFloat));
CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
auto bias = scope->FindTensor(bias_name);
auto bias_dims = bias->dims();
CHECK_EQ(bias_dims.production(), n);
bias_node = graph->Add(bias_name, *bias, {1, n, 1, 1});
}
fc_op->set_input_b(*bias_node->data());
}
// Reshape output of FC node from (m, n, 1, 1) to out_shape
auto reshaped_fc_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_fc_op = reshaped_fc_node->data<ge::op::Reshape>();
reshaped_fc_op->set_input_tensor(*fc_node->data());
auto out_shape = out_dims.Vectorize();
reshaped_fc_op->set_attr_shape(
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
reshaped_fc_op->set_attr_axis(0);
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
......@@ -42,7 +42,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto axis = op_info->GetAttr<int>("axis");
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
if (axis < 0) {
axis += x_rank;
}
......
......@@ -20,7 +20,9 @@ add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
......@@ -83,6 +85,15 @@ lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc
DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <>
void ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kImageDefault)>::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
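  // Two inputs use the specialized concat2 kernel; more inputs fall back to
  // concat_mul, which Run() launches once per input tensor.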
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "image/concat_kernel.cl", build_options_);
// UpdateParams<kFloat, kImageDefault>();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
// auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.output->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
  // last input dim; overwritten by the switch below for supported axes
  int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
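  // The switch below maps the concat axis onto the 2D image layout
  // (image width spans C/4 * W, image height spans N * H): `width` is the
  // extent used to locate elements along that direction and `flag` selects
  // whether the kernel walks the image's width (1) or height (0) axis.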
int flag = 1; // cxw
switch (axis_) {
case 0:
width = x_dims[2]; // n
flag = 0;
break;
case 1:
width = x_dims[3]; // c
break;
case 2:
width = x_dims[0]; // h
flag = 0;
break;
case 3:
case -1:
width = x_dims[1]; // w
break;
default:
printf("this axis: %d does not support \n", axis_);
}
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status =
kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
auto* x_buf = inputs[i]->data<float, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
start += inputs[i]->dims()[axis_];
}
}
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
return "Concat using cl::Image, kFloat";
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
// UpdateParams<kFloat, kImageDefault>();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
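  // pre_size_ / post_size_ (set in PrepareForRun) are the element counts
  // before / after the concat axis; total is the number of output elements
  // spanned by one pre-axis slice.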
auto global_work_size = cl::NDRange{axis_size_};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
auto axis0 = inputs[0]->dims()[axis_];
int total0 = axis0 * post_size_;
int total1 = (axis_size_ - axis0) * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total1);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
int size = inputs[i]->dims()[axis_];
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(size));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
start += size;
}
}
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
return "Concat using cl::Buffer, kFloat";
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
Concat_buffer;
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kImageDefault)>
Concat_image;
REGISTER_LITE_KERNEL(
concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kInt32),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kInt32),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout>
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> {
public:
using param_t = operators::ConcatParam;
void PrepareForRun() override;
void Run() override;
std::string doc(); // override;
// protected:
// void UpdateParams();
int axis_size_ = 1;
int post_size_ = 1;
int pre_size_ = 1;
int axis_ = 1;
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -362,6 +362,20 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
  } else if (kernel_h == 3 && kernel_w == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
......@@ -582,6 +596,184 @@ void ConvImageCompute::Conv2d1x1() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d3x3() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
int filter_channel = filter_dims[1];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
// re-calc group
int new_groups{param.groups};
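  // A standard convolution (filter Cout == output channels, filter Cin ==
  // input channels) runs as a single group; a depthwise filter (filter Cin
  // == 1) keeps param.groups; other grouped convs derive the group count
  // from Cin / filter_Cin.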
if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
new_groups = 1;
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......@@ -726,6 +918,7 @@ void ConvImageCompute::Conv2d5x5() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......
......@@ -71,6 +71,7 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ScaleParam;
std::string doc() const override { return "Scale using cl::Image2D, kFloat"; }
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/scale_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<float, cl::Image2D>();
const float scale = param.scale;
const float bias = param.bias;
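    // scale and bias are passed straight to image/scale_kernel.cl, which
    // applies them element-wise over the whole image.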
LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
static_cast<cl::size_type>(out_image_shape["height"])};
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, bias);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
private:
std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(scale,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ScaleComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
......@@ -41,7 +41,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
// X node
std::shared_ptr<Node> x_node = nullptr;
......
......@@ -135,6 +135,8 @@ add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS})
add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS})
add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS})
add_operator(one_hot basic SRCS one_hot_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
DEPS fc_op memory
......