
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# How to Add a New Layout
In Paddle-Lite, a Place bundles Target, Layout, and Precision information, and is used to register and select a model's concrete kernels. Taking the new layouts `ImageDefault`, `ImageFolder`, and `ImageNW` as an example, this document explains how to add a new Layout.
Searching the `lite/core/` and `lite/api` directories with the keyword `NHWC` shows that Layout entries must be added to each of the following files:
1. lite/api/paddle_place.h
2. lite/api/paddle_place.cc
3. lite/api/python/pybind/pybind.cc
4. lite/core/op_registry.h
5. lite/core/op_registry.cc
## 1. lite/api/paddle_place.h
Add the new layouts to `enum class DataLayoutType`. Note that the values of existing layouts must not change; new layouts simply take the next increasing values:
```cpp
enum class DataLayoutType : int {
kUnk = 0,
kNCHW = 1,
kNHWC = 3,
kImageDefault = 4, // for opencl image2d
kImageFolder = 5, // for opencl image2d
kImageNW = 6, // for opencl image2d
kAny = 2, // any data layout
NUM = 7, // number of fields.
};
```
## 2. lite/api/paddle_place.cc
This file needs three changes. Note that the string names added in the `DataLayoutToStr` function must follow the numeric order of the enum values in `lite/api/paddle_place.h`:
```cpp
// Change 1 in this file
const std::string& DataLayoutToStr(DataLayoutType layout) {
static const std::string datalayout2string[] = {
"unk", "NCHW", "any", "NHWC", "ImageDefault", "ImageFolder", "ImageNW"};
auto x = static_cast<int>(layout);
CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
return datalayout2string[x];
}
// Change 2 in this file
const std::string& DataLayoutRepr(DataLayoutType layout) {
static const std::string datalayout2string[] = {"kUnk",
"kNCHW",
"kAny",
"kNHWC",
"kImageDefault",
"kImageFolder",
"kImageNW"};
auto x = static_cast<int>(layout);
CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
return datalayout2string[x];
}
// Change 3 in this file
std::set<DataLayoutType> ExpandValidLayouts(DataLayoutType layout) {
static const std::set<DataLayoutType> valid_set({DATALAYOUT(kNCHW),
DATALAYOUT(kAny),
DATALAYOUT(kNHWC),
DATALAYOUT(kImageDefault),
DATALAYOUT(kImageFolder),
DATALAYOUT(kImageNW)});
if (layout == DATALAYOUT(kAny)) {
return valid_set;
}
return std::set<DataLayoutType>({layout});
}
```
## 3. lite/api/python/pybind/pybind.cc
```cpp
// DataLayoutType
py::enum_<DataLayoutType>(*m, "DataLayoutType")
.value("NCHW", DataLayoutType::kNCHW)
.value("NHWC", DataLayoutType::kNHWC)
.value("ImageDefault", DataLayoutType::kImageDefault)
.value("ImageFolder", DataLayoutType::kImageFolder)
.value("ImageNW", DataLayoutType::kImageNW)
.value("Any", DataLayoutType::kAny);
```
## 4. lite/core/op_registry.h
Find `using any_kernel_registor_t =` inside `class KernelRegistry final` and add the following entries:
```cpp
// Find `using any_kernel_registor_t =` inside KernelRegistry final
// and add the following entries:
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageFolder)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageNW)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageFolder)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageFolder)> *, //
KernelRegistryForTarget<TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageNW)> *, //
```
## 5. lite/core/op_registry.cc
This file needs two changes:
```cpp
// Change 1 in this file
#define CREATE_KERNEL1(target__, precision__) \
switch (layout) { \
case DATALAYOUT(kNCHW): \
return Create<TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(kNCHW)>(op_type); \
case DATALAYOUT(kAny): \
return Create<TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(kAny)>(op_type); \
case DATALAYOUT(kNHWC): \
return Create<TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(kNHWC)>(op_type); \
case DATALAYOUT(kImageDefault): \
return Create<TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(kImageDefault)>(op_type); \
case DATALAYOUT(kImageFolder): \
return Create<TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(kImageFolder)>(op_type); \
case DATALAYOUT(kImageNW): \
return Create<TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(kImageNW)>(op_type); \
default: \
LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \
}
// Change 2 in this file
// Find the following function in the file
KernelRegistry::KernelRegistry()
: registries_(static_cast<int>(TARGET(NUM)) *
static_cast<int>(PRECISION(NUM)) *
static_cast<int>(DATALAYOUT(NUM)))
// Inside this function, add the following entries for the new layouts
INIT_FOR(kOpenCL, kFP16, kNCHW);
INIT_FOR(kOpenCL, kFP16, kNHWC);
INIT_FOR(kOpenCL, kFP16, kImageDefault);
INIT_FOR(kOpenCL, kFP16, kImageFolder);
INIT_FOR(kOpenCL, kFP16, kImageNW);
INIT_FOR(kOpenCL, kFloat, kImageDefault);
INIT_FOR(kOpenCL, kFloat, kImageFolder);
INIT_FOR(kOpenCL, kFloat, kImageNW);
INIT_FOR(kOpenCL, kAny, kImageDefault);
INIT_FOR(kOpenCL, kAny, kImageFolder);
INIT_FOR(kOpenCL, kAny, kImageNW);
```
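With the registrations above in place, kernels can declare the new layouts. As a rough illustration (a hedged sketch, not part of the original checklist): `layout_demo` and `LayoutDemoCompute` below are hypothetical names, and the usage follows the common Paddle-Lite kernel-registration pattern; only the `kImageDefault` arguments relate to the new layout.
```cpp
// Hedged sketch: `layout_demo` and `LayoutDemoCompute` are hypothetical;
// the DATALAYOUT(kImageDefault) arguments exercise the newly added layout.
REGISTER_LITE_KERNEL(layout_demo,
                     kOpenCL,
                     kFP16,
                     kImageDefault,  // the newly added layout
                     paddle::lite::kernels::opencl::LayoutDemoCompute,
                     image_default)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
```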
# How to Add a New Pass
This document introduces the `Pass` architecture in `Lite` from three angles: **what a Pass is**, **the Pass implementation and its interfaces**, and **the general Pass registration workflow**. It then uses `fc_fuse_pass` as an example to explain what a `fusion_pass` does and how to register one.
## Background: what is a Pass?
**After CxxPredictor loads a model, it optimizes the model before running inference. This optimization step is implemented with Passes.**
The call chain is as follows:
![diagram](https://user-images.githubusercontent.com/45189361/69638690-20d21880-1096-11ea-8169-1d2c7e1a1609.png)
- `CreatePredictor(CxxConfig)` calls `Predictor->Build(CxxConfig)`
- Building the CxxPredictor (`Build`) takes two steps:
    - `Predictor->LoadModel()` loads the model file into a Program
    - `Predictor->optimizer_.Run()` optimizes the original graph structure of the Program
- The graph optimization itself is done by calling `Pass->Apply(const std::unique_ptr<SSAGraph>& graph)`.
**Each kind of Pass defines one optimization**, such as kernel selection, OP fusion, redundant-OP removal, subgraph creation, memory optimization, type inference, and type conversion.
## Pass implementation and interfaces: the Pass base class, PassManager, and Pass registration
### 1. The Pass base class: `paddle::lite::mir::Pass`
```c++
class Pass {
 public:
  // The kind of the Pass; by effect, Passes fall into three kinds:
  enum class Kind {
    // 1. Passes that modify the graph topology of the model
    kProgramWise = 0,
    // 2. Passes that modify state without changing the graph structure
    kStmtWise,
    // 3. Passes that do not modify the IR; used to collect and visualize information
    kDebug,
  };
  // Main interface: Apply defines what the Pass does when it runs
  virtual void Apply(const std::unique_ptr<SSAGraph>& graph) = 0;

  bool is_program_pass() const { return kind_ == Kind::kProgramWise; }
  bool is_stmt_pass() const { return kind_ == Kind::kStmtWise; }

  virtual ~Pass() = default;

 private:
  const Kind kind_;   // the kind of the pass
  std::string name_;  // the name of the pass
  // The hardware targets the Pass runs on; during model optimization,
  // Passes are filtered by whether they match the current target.
  std::set<TargetType> bound_targets_;
  // The kernels bound to the Pass
  std::unordered_map<std::string, std::set<lite_api::Place>> bound_kernels_;
};

// Different kinds.
class ProgramPass : public Pass {
 public:
  ProgramPass() : Pass(Kind::kProgramWise) {}
};
class StmtPass : public Pass {
 public:
  StmtPass() : Pass(Kind::kStmtWise) {}
};
class DebugPass : public Pass {
 public:
  DebugPass() : Pass(Kind::kDebug) {}
};
```
**Source location**: `lite/core/mir/pass.h`
**Key data members**:
`const Kind kind_`: the kind of the Pass. There are three basic kinds: `ProgramPass` modifies the graph structure, `StmtPass` modifies state, and `DebugPass` collects information and controls visualization during debugging.
`std::string name_`: the name of the pass.
`std::set<TargetType> bound_targets_`: the hardware targets the Pass runs on; the optimizer.Run() step automatically selects matching Passes based on the current target.
`std::unordered_map<std::string, std::set<lite_api::Place>> bound_kernels_`: the kernels bound to the Pass.
**Key interface**:
`Pass::Apply(const std::unique_ptr<SSAGraph>& graph)`: the concrete optimization performed by the Pass, and the interface a newly registered Pass must implement. Its input is a pointer to an `SSAGraph`, a topological representation of the model structure.
### 2. Pass management: `paddle::lite::mir::PassManager`
```c++
class PassManager {
 public:
  // Global static PassManager instance, holding all Passes used for graph optimization
  static PassManager& Global() {
    static PassManager x;
    return x;
  }
  // Run all Passes
  void Run(const std::unique_ptr<SSAGraph>& graph) {
    for (auto& pass : passes_) {
      LOG(INFO) << "Running MIR pass " << pass->name();
      pass->Apply(graph);
    }
  }

 private:
  std::list<std::unique_ptr<mir::Pass>> passes_;  // holds all Passes
  std::map<std::string, mir::Pass*> pass_map_;    // maps PassName -> Pass
};
```
**Source location**: `lite/core/mir/pass_manager.h`
**Key data members**:
`std::list<std::unique_ptr<mir::Pass>> passes_`: a list holding every registered Pass.
`std::map<std::string, mir::Pass*> pass_map_`: a map holding every "pass name → Pass object" pair, used to look Passes up by name.
**Key interfaces**:
`static PassManager& Global()`: returns the global static PassManager, which holds all registered Passes.
`bool AddNewPass(const std::string& name, Pass* pass)`: adds a new Pass to the PassManager.
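Putting the interfaces above together, here is a minimal sketch of running every registered Pass on a graph (illustrative only; it simply wraps the members described above):
```c++
// Illustrative sketch: apply all registered Passes, in registration order.
void OptimizeGraph(const std::unique_ptr<paddle::lite::mir::SSAGraph>& graph) {
  paddle::lite::mir::PassManager::Global().Run(graph);
}
```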
### 3. Pass registration: `paddle::lite::mir::PassRegistry`
**Source location**: `lite/core/mir/pass_registry.h`
**Key interface**:
`REGISTER_MIR_PASS(name__, class__)`: a macro that registers a Pass. Registration boils down to `PassManager::Global().AddNewPass(name__, class__)`, which adds the new Pass to the global `PassManager`.
## General Pass registration workflow and usage
### 1. Pass registration workflow
Under `lite/core/mir` or one of its subdirectories, derive from the `Pass` base class, implement the `Pass::Apply` interface, and register the Pass with `PassManager` via the macro `REGISTER_MIR_PASS(name__, class__)`; that completes the registration of a new Pass.
**Taking a new** `new_demo_pass` **as an example**, the steps are:
(1) Create `example_pass.cc` and `example_pass.h` under `lite/core/mir`.
(2) In `example_pass.h`, define your own Pass class by deriving from one of the Pass base classes (ProgramPass, StmtPass, or DebugPass):
```c++
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ExamplePass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph> &graph) override {}
...
};
} // namespace mir
} // namespace lite
} // namespace paddle
```
(3) In `example_pass.cc`, implement the `ExamplePass::Apply()` interface and register `ExamplePass`:
```c++
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/example_pass.h"
namespace paddle {
namespace lite {
namespace mir {
void ExamplePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
...
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(example_pass, paddle::lite::mir::ExamplePass)
    .BindTargets({TARGET(kARM)});  // the hardware targets the Pass runs on
// .BindKernel("conv2d");          // the kernel bound to the Pass
```
(4) Edit `lite/core/mir/CMakeLists.txt` to compile `example_pass.cc` into the `mir_passes` library:
```cmake
lite_cc_library(mir_passes
SRCS
  example_pass.cc  # the newly added Pass file
...
memory_optimize_pass.cc
DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
```
### 2. Pass usage workflow
A Pass registered with PassManager does not take effect automatically; it must also be added in the `optimizer->run()` call chain to be invoked during model optimization.
(1) Reference the Pass in `paddle_use_passes.h`:
```c++
#include "paddle_lite_factory_helper.h"  // NOLINT
...
USE_MIR_PASS(new_demo_pass);  // use new_demo_pass
```
(2) To have the Pass invoked during model optimization, add it manually in `optimizer->run()`:
edit `lite/core/optimizer.h` and add `new_demo_pass` to the `Optimizer::Run()` function:
```c++
class Optimizer {
public:
void Run(...) {
...
if (passes.empty()) {
RunPasses(std::vector<std::string>{
{"new_demo_pass" //将新注册的Pass添加在这里
...
}
...
}
```
(3) Only CxxPredictor optimizes the model with Passes after loading it:
```c++
...
#include "paddle_use_passes.h" // 引用Pass优化模型
void RunModel() {
// 1. 创建 CxxConfig
CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places(Place{TARGET(kARM), PRECISION(kFloat)});
// 2. 创建CxxPredictor,该过程包括加载模型和用Pass优化模型
std::shared_ptr> predictor =
Creat<CxxConfig>(config);
}
```
## Defining and registering a Fusion Pass
A `Fusion Pass` is a common kind of graph-optimization Pass: it fuses several consecutive OPs into a single equivalent OP, reducing data movement and simplifying the graph. At run time the Pass invokes a `Fuser` to find and replace the target graph pattern automatically, so registering a `FusionPass` also requires implementing the corresponding Fuser class.
The following uses `fc_fuse_pass` as an example to explain what a `FusionPass` does and how to register one.
### What `fc_fuse_pass` does
It fuses an adjacent `mul` OP and `elementwise_add` OP into a single `FC` OP:
```c++
mul(X) = X * W
elementwise_add(mul(X)) = X * W + Bias
// ----------> after fusion
FC(X) = X * W + Bias
```
The Pass produces the following effect:
![图片](https://user-images.githubusercontent.com/45189361/69639193-12383100-1097-11ea-9063-21f030414080.png)
The original parameters of mul and elementwise_add map onto the FC parameters as follows:
![图片](https://user-images.githubusercontent.com/45189361/69638836-74446680-1096-11ea-9cdc-a961fa995dfe.png)
### How to register `fc_fuse_pass`
#### 1. Create the FcFuser
(1) Create `fc_fuser.cc` and `fc_fuser.h` under `lite/core/mir/fusion`.
(2) In `fc_fuser.h`, define your own Fuser class by deriving from `FuseBase`:
```c++
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class FcFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
```
**Key interfaces**:
`FuseBase::BuildPattern`: describes the graph pattern to be replaced; the Fuser finds and replaces this pattern automatically at run time.
`FuseBase::GenOpDesc`: creates the equivalent fused OP.
`FuseBase::InsertNewNode`: replaces the original pattern with the fused OP.
For `FcFuser`: BuildPattern describes the `mul + elementwise_add` pattern, GenOpDesc creates the FC OP, and InsertNewNode replaces the `mul + elementwise_add` pattern in the model with the newly created `FC` OP.
(3) In `fc_fuser.cc`, implement the `BuildPattern()`, `GenOpDesc()`, and `InsertNewNode()` interfaces.
The FcFuser implementations below illustrate all three interfaces:
```c++
// 1. BuildPattern describes the graph pattern to be replaced.
// FcFuser::BuildPattern() describes the mul + elementwise_add pattern.
void FcFuser::BuildPattern() {
  // (1) Describe the pattern with OpNodes and VarNodes.
  // the mul OP
  auto* mul = OpNode("mul", "mul");
  // inputs and output of the mul OP
  auto* x = VarNode("x")->assert_is_op_input("mul", "X");
  auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
  auto* mul_out = VarNode("mul_out");
  // the elementwise_add OP
  auto* add = OpNode("add", "elementwise_add");
  // input of elementwise_add
  auto* b = VarNode("b")->assert_is_persistable_var();
  // output of elementwise_add (the final output)
  auto* Out = VarNode("Out");

  // (2) Describe the topology (how mul and elementwise_add connect before fusion).
  std::vector<PMNode*> mul_inputs{W, x};
  std::vector<PMNode*> add_inputs{mul_out, b};
  mul_inputs >> *mul >> *mul_out;
  add_inputs >> *add >> *Out;

  // (3) Mark the nodes that will be removed from the new topology:
  // the fused OPs and the intermediate variables between them.
  mul_out->AsIntermediate();
  mul->AsIntermediate();
  add->AsIntermediate();
}

// 2. GenOpDesc creates the equivalent fused OP.
// FcFuser::GenOpDesc() creates the FC OP.
cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) {
  // (1) Take the OpDesc of the first OP node and clear its inputs/outputs.
  cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
  op_desc.mutable_inputs()->clear();
  op_desc.mutable_outputs()->clear();
  // (2) Modify the OpDesc: set the OP type to "fc" (the FC OP's op_type).
  op_desc.SetType("fc");
  // (3) Set the Input, Output, and Attribute fields of the OpDesc,
  // wiring them to the VarNodes created in BuildPattern().
  op_desc.SetInput("Input", {matched.at("x")->arg()->name});
  op_desc.SetInput("W", {matched.at("W")->arg()->name});
  op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
  op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
  op_desc.SetAttr(
      "in_num_col_dims",
      matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
  return op_desc;
}

// 3. InsertNewNode replaces the original pattern in the graph with the fused OP.
// FcFuser::InsertNewNode() replaces "mul + elementwise_add" with the FC OP.
void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
  // (1) Create the FC OP's parameters (OpDesc).
  auto op_desc = GenOpDesc(matched);
  // Create an FC OP.
  auto fc_op = LiteOpRegistry::Global().Create("fc");
  // Look up the scope and valid_places (supported devices) of the original topology.
  auto mul = matched.at("mul")->stmt()->op();
  auto* scope = mul->scope();
  auto& valid_places = mul->valid_places();
  // (2) Give the FC OP the same scope and valid_places as before the fusion,
  // and create its node in the graph.
  fc_op->Attach(op_desc, scope);
  auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);
  // (3) Link the FC node to its input/output VarNodes.
  IR_NODE_LINK_TO(matched.at("W"), new_op_node);
  IR_NODE_LINK_TO(matched.at("x"), new_op_node);
  IR_NODE_LINK_TO(matched.at("b"), new_op_node);
  IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
}
```
#### 2. Register fc_fuse_pass
(1) Create `fc_fuse_pass.cc` and `fc_fuse_pass.h` under `lite/core/mir/fusion`.
(2) In `fc_fuse_pass.h`, define `FcFusePass` by deriving from `ProgramPass`:
```c++
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class FcFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
}  // namespace mir
}  // namespace lite
}  // namespace paddle
```
(3) In `fc_fuse_pass.cc`, implement the `FcFusePass::Apply()` interface and register `FcFusePass`:
```c++
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/example_pass.h"
namespace paddle {
namespace lite {
namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::FcFuser fuser;
fuser(graph.get());namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)}) // FcFusePass 可以在任何硬件平台执行
.BindKernel("fc"); // FcFusePass 绑定 fc_kernel
```
(4) Edit `lite/core/mir/fusion/CMakeLists.txt` to compile `fc_fuser.cc` into `mir_fusers`:
```cmake
lite_cc_library(fuse_fc
SRCS fc_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
...
CACHE INTERNAL "fusers")
```
(5) Edit `lite/core/mir/CMakeLists.txt` to compile `fc_fuse_pass.cc` into `mir_passes`:
```cmake
lite_cc_library(mir_passes
SRCS
fusion/fc_fuse_pass.cc
...
DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
```
#### 3. Use fc_fuse_pass
(1) `lite/api/paddle_use_passes.h` pulls in the newly added pass with the `USE_MIR_PASS` macro:
```c++
USE_MIR_PASS(lite_fc_fuse_pass);
```
(2) Add the newly registered pass to the `Optimizer::Run()` function in `lite/core/optimizer.h`:
```C++
class Optimizer {
public:
void Run(Program&& program,
const std::vector<Place>& valid_places,
core::KernelPickFactor kernel_pick_factor,
const std::vector<std::string>& passes = {}) {
...
if (passes.empty()) {
RunPasses(std::vector<std::string>{
{"lite_fc_fuse_pass", // the newly registered pass
...
"argument_type_display_pass"}});
} else {
RunPasses(passes);
}
exec_scope_ = program.exec_scope();
}
```
(3) With these changes in place, when a CxxPredictor is created via CreatePredictor(CxxConfig), model optimization invokes `lite_fc_fuse_pass`, which scans for `mul + elementwise_add` patterns and replaces them with the equivalent FC OP.
# CV Image Preprocessing API Reference
Set the `BUILD_CV` variable to `ON` in the build script `Paddle-Lite/lite/tools/build.sh` (see [source compilation](../source_compile) for the other build options) to make sure Lite builds correctly. The accelerated `CV` image library is then compiled in, and the `paddle_image_preprocess.h` API header is generated.
- Hardware platform: `ARM`
- Operating systems: `MAC` and `LINUX`
## CV image preprocessing features
Lite supports converting images between color spaces (`Convert`), scaling (`Resize`), flipping (`Flip`), rotating (`Rotate`), and converting image data into a `Tensor` (`ImageToTensor`). The API for each feature is described below.
### CV enums and structs
- Color spaces
```cpp
enum ImageFormat {
RGBA = 0,
BGRA,
RGB,
BGR,
GRAY,
NV21 = 11,
NV12,
};
```
- Flip parameter
```cpp
enum FlipParam {
X = 0, // flip along the X axis
Y, // flip along the Y axis
XY // flip along the XY axis
};
```
- Transform parameters
```cpp
typedef struct {
int ih; // input height
int iw; // input width
  int oh;               // output height
int ow; // output width
FlipParam flip_param; // flip, support x, y, xy
float rotate_param; // rotate, support 90, 180, 270
} TransParam;
```
### Data members of the ImagePreprocess class
The `ImagePreprocess` class has the following three private data members, initialized by its constructor:
```cpp
private:
ImageFormat srcFormat_; // input image color format
ImageFormat dstFormat_; // output image color format
TransParam transParam_; // image transform parameter
// init
ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, TransParam param) {
this->srcFormat_ = srcFormat;
this->dstFormat_ = dstFormat;
this->transParam_ = param;
}
```
### Color space conversion: Convert
The `Convert` function supports the GRAY, NV12 (NV21), RGB (BGR), and RGBA (BGRA) color spaces.
+ Conversions currently supported in both directions:
- GRAY2BGR
- GRAY2RGB
- BGR2RGB
- BGRA2BGR
- BGRA2RGB
- RGBA2RGB
- RGBA2BGR
- BGRA2RGBA
+ One-way conversions currently supported:
- NV12—BGR
- NV21—BGR
- NV12—RGB
- NV21—RGB
- NV12—BGRA
- NV21—BGRA
- NV12—RGBA
- NV21—RGBA
+ The `Convert` API:
```cpp
// Variant 1
void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst);
// Variant 2
void ImagePreprocess::imageCovert(const uint8_t* src,
                                  uint8_t* dst,
                                  ImageFormat srcFormat,
                                  ImageFormat dstFormat);
```
+ For the first `imageCovert` variant, the omitted parameters come from the `ImagePreprocess` data members, so when constructing the `ImagePreprocess` object you must set the following members:
    - param srcFormat: the `srcFormat_` member of `ImagePreprocess`
    - param dstFormat: the `dstFormat_` member of `ImagePreprocess`
- The second `imageCovert` variant can be called directly.
### Scaling: Resize
`Resize` supports the GRAY, NV12 (NV21), RGB (BGR), and RGBA (BGRA) color spaces.
`Resize` currently supports the `bilinear` method.
+ The `Resize` API:
```cpp
// Variant 1
void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst);
// Variant 2
void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, int srcw, int srch, int dstw, int dsth);
```
+ For the first `imageResize` variant, the omitted parameters come from the `ImagePreprocess` data members, so when constructing the `ImagePreprocess` object you must set the following members:
    - param srcFormat: the `dstFormat_` member of `ImagePreprocess`
    - param srcw: the `transParam_.iw` member of `ImagePreprocess`
    - param srch: the `transParam_.ih` member of `ImagePreprocess`
    - param dstw: the `transParam_.ow` member of `ImagePreprocess`
    - param dsth: the `transParam_.oh` member of `ImagePreprocess`
- The second `imageResize` variant can be called directly.
### Rotation: Rotate
`Rotate` supports the GRAY, RGB (BGR), and RGBA (BGRA) color spaces.
`Rotate` currently supports angles of 90, 180, and 270 degrees.
+ The `Rotate` API:
```cpp
// Variant 1
void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst);
// Variant 2
void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, int srcw, int srch, float degree);
```
+ For the first `imageRotate` variant, the omitted parameters come from the `ImagePreprocess` data members, so when constructing the `ImagePreprocess` object you must set the following members:
    - param srcFormat: the `dstFormat_` member of `ImagePreprocess`
    - param srcw: the `transParam_.ow` member of `ImagePreprocess`
    - param srch: the `transParam_.oh` member of `ImagePreprocess`
    - param degree: the `transParam_.rotate_param` member of `ImagePreprocess`
- The second `imageRotate` variant can be called directly.
### Flipping: Flip
`Flip` supports the GRAY, RGB (BGR), and RGBA (BGRA) color spaces.
`Flip` currently supports flipping along the X axis, along the Y axis, and along both XY axes.
+ The `Flip` API:
```cpp
// Variant 1
void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst);
// Variant 2
void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, int srcw, int srch, FlipParam flip_param);
```
+ For the first `imageFlip` variant, the omitted parameters come from the `ImagePreprocess` data members, so when constructing the `ImagePreprocess` object you must set the following members:
    - param srcFormat: the `dstFormat_` member of `ImagePreprocess`
    - param srcw: the `transParam_.ow` member of `ImagePreprocess`
    - param srch: the `transParam_.oh` member of `ImagePreprocess`
    - param flip_param: the `transParam_.flip_param` member of `ImagePreprocess`
- The second `imageFlip` variant can be called directly.
### Image2Tensor
`Image2Tensor` supports the RGB (BGR) and RGBA (BGRA) color spaces.
`Image2Tensor` currently supports the `NCHW` and `NHWC` layouts.
Besides converting an image into a `Tensor`, `Image2Tensor` also normalizes the image data.
+ The `Image2Tensor` API:
```cpp
// Variant 1
void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, LayoutType layout, float* means, float* scales);
// Variant 2
void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, ImageFormat srcFormat, int srcw, int srch, LayoutType layout, float* means, float* scales);
```
+ For the first `image2Tensor` variant, the omitted parameters come from the `ImagePreprocess` data members, so when constructing the `ImagePreprocess` object you must set the following members:
    - param srcFormat: the `dstFormat_` member of `ImagePreprocess`
    - param srcw: the `transParam_.ow` member of `ImagePreprocess`
    - param srch: the `transParam_.oh` member of `ImagePreprocess`
    - The second `image2Tensor` variant can be called directly.
## CV image preprocessing demo
Example: take a `1920x1080` `NV12` image `src` and produce a `960x540` `RGB` image `dst`; then rotate it by `90` degrees and flip it along the `X` axis; finally, store it in a Tensor in `NHWC` layout.
Construct the `ImagePreprocess` object and initialize its members:
```cpp
// init
srcFormat = ImageFormat::NV12;
dstFormat = ImageFormat::RGB;
srch = 1920;
srcw = 1080;
dsth = 960;
dstw = 540;
flip_param = FlipParam::X;
degree = 90;
layout = LayoutType::NHWC;
// Option 1:
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip_param;
tparam.rotate_param = degree;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
// Option 2:
ImagePreprocess image_preprocess;
```
### imageConvert Demo
```cpp
// Option 1:
image_preprocess.imageCovert(src, lite_dst);
// Option 2:
image_preprocess.imageCovert(src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat);
```
### imageResize Demo
```cpp
// Option 1:
image_preprocess.imageResize(lite_dst, resize_tmp);
// Option 2:
image_preprocess.imageResize(lite_dst, resize_tmp, (ImageFormat)dstFormat, srcw,
                             srch, dstw, dsth);
```
### imageRotate Demo
```cpp
// Option 1:
image_preprocess.imageRotate(resize_tmp, tv_out_rotate);
// Option 2:
image_preprocess.imageRotate(resize_tmp, tv_out_rotate, (ImageFormat)dstFormat, dstw, dsth, degree);
```
### imageFlip Demo
```cpp
// Option 1:
image_preprocess.imageFlip(tv_out_rotate, tv_out_flip);
// Option 2:
image_preprocess.imageFlip(tv_out_rotate, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip_param);
```
### image2Tensor Demo
```cpp
// Option 1:
image_preprocess.image2Tensor(tv_out_flip, &dst_tensor, layout, means, scales);
// Option 2:
image_preprocess.image2Tensor(tv_out_flip, &dst_tensor, (ImageFormat)dstFormat, dstw, dsth, layout, means, scales);
```
# Using the x86 Inference Library
Paddle-Lite supports building the x86 inference library in Docker or on Linux. See [environment preparation](../installation/source_compile) for setting up the environment.
(Note: a non-Docker Linux environment must be Ubuntu 16.04.)
## Building
1. Download the code:
```bash
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
# switch to a version after release/v2.0.0
git checkout <release_tag>
```
2. Build from source:
```bash
cd Paddle-Lite
./lite/tools/build.sh x86
```
## Build output
The x86 build output is located in `build.lite.x86/inference_lite_lib`.
Its **contents** are:
1. `bin` folder: the executable tool `test_model_bin`
2. `cxx` folder: the C++ libraries and corresponding headers
  - `include`: header files
  - `lib`: library files
    - bundled static libraries:
      - `libpaddle_api_full_bundled.a`: static library with both full_api and light_api functionality
      - `libpaddle_api_light_bundled.a`: static library with light_api functionality only
    - shared libraries:
      - `libpaddle_full_api_shared.so`: shared library with both full_api and light_api functionality
      - `libpaddle_light_api_shared.so`: shared library with light_api functionality only
3. `third_party` folder: third-party libraries
## x86 inference API example
```c++
#include <gflags/gflags.h>
#include <iostream>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
DEFINE_string(model_dir, "", "Model dir path.");
DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
void RunModel() {
// 1. Set CxxConfig
CxxConfig config;
config.set_model_file(FLAGS_model_dir + "model");
config.set_param_file(FLAGS_model_dir + "params");
config.set_valid_places({
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}
});
// 2. Create PaddlePredictor by CxxConfig
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<CxxConfig>(config);
// 3. Prepare input data
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize(shape_t({1, 3, 224, 224}));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = 1;
}
// 4. Run predictor
predictor->Run();
// 5. Get output
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl;
for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
std::cout << "Output[" << i << "]:" << output_tensor->data<float>()[i] << std::endl;
}
}
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
RunModel();
return 0;
}
```
# C++ API Reference
## CreatePaddlePredictor
```c++
template <typename ConfigT>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
```
`CreatePaddlePredictor` builds a predictor from a config such as `MobileConfig`.
Example:
```c++
// Set up MobileConfig
MobileConfig config;
config.set_model_dir(FLAGS_model_dir);
// Create PaddlePredictor from MobileConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
```
Parameters:
- `config(MobileConfig)` - the configuration used to build the Predictor.
Returns: a `PaddlePredictor` pointer
Return type: `std::shared_ptr<PaddlePredictor>`
## CxxConfig
```c++
class CxxConfig;
```
`CxxConfig` holds the configuration for building a CxxPredictor, such as the path of the protobuf-format model, the power mode, the number of worker threads, and Place information.
Example:
```c++
CxxConfig config;
// Set the model directory (used when loading an uncombined model)
config.set_model_dir("<your_model_dir_path>");
// Set the number of worker threads
config.set_threads(4);
// Set the power mode
config.set_power_mode(LITE_POWER_NO_BIND);
// Set the valid places
std::vector<Place> places{Place{TARGET(kARM), PRECISION(kFloat)}};
config.set_valid_places(places);
// Create CxxPredictor from CxxConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
```
### `set_model_dir(model_dir)`
Sets the model directory path; used when loading an uncombined model from disk.
Parameters:
- `model_dir(str)` - model directory path
Returns: `None`
Return type: `None`
### `model_dir()`
Returns the configured model directory path.
Parameters:
- `None`
Returns: the model directory path
Return type: `str`
### `set_model_file(model_file)`
Sets the model file path; used when loading a combined model.
Parameters:
- `model_file(str)` - model file path
Return type: `None`
### `model_file()`
Gets the configured model file path; used when loading a combined model.
Parameters:
- `None`
Returns: the model file path
Return type: `str`
### `set_param_file(param_file)`
Sets the model parameter file path; used when loading a combined model.
Parameters:
- `param_file(str)` - model parameter file path
Return type: `None`
### `param_file()`
Gets the configured model parameter file path; used when loading a combined model.
Parameters:
- `None`
Returns: the model parameter file path
Return type: `str`
### `set_valid_places(valid_places)`
Sets the list of valid places.
Parameters:
- `valid_places(list)` - the list of valid places.
Return type: `None`
Example:
```c++
CxxConfig config;
// Set the model directory (used when loading an uncombined model)
config.set_model_dir("<your_model_dir_path>");
// Set the valid places.
// Note: the order of Places in valid_places expresses preference. For example,
// to prefer Int8 kernels on ARM, put Place{TARGET(kARM), PRECISION(kInt8)}
// first in the valid_places list.
std::vector<Place> places{Place{TARGET(kARM), PRECISION(kInt8)},
                          Place{TARGET(kARM), PRECISION(kFloat)}};
config.set_valid_places(places);
// Create CxxPredictor from CxxConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
```
### `set_power_mode(mode)`
Sets the CPU power mode. Defaults to `PowerMode.LITE_POWER_HIGH` if not set.
*Note: this only takes effect when `OpenMP` is enabled; otherwise the system schedules threads automatically. This function only takes effect when built with the `LITE_WITH_ARM` option.*
Parameters:
- `mode(PowerMode)` - CPU power mode
Returns: `None`
Return type: `None`
### `power_mode()`
Gets the configured CPU power mode.
*Note: this function only takes effect when built with the `LITE_WITH_ARM` option.*
Parameters:
- `None`
Returns: the configured CPU power mode
Return type: `PowerMode`
### `set_threads(threads)`
Sets the number of worker threads. Defaults to single-threaded if not set.
*Note: this only takes effect when `OpenMP` is enabled; otherwise a single thread is used. This function only takes effect when built with the `LITE_WITH_ARM` option.*
Parameters:
- `threads(int)` - number of worker threads
Returns: `None`
Return type: `None`
### `threads()`
Gets the configured number of worker threads.
*Note: this function only takes effect when built with the `LITE_WITH_ARM` option.*
Parameters:
- `None`
Returns: the number of worker threads
Return type: `int`
### `set_x86_math_library_num_threads(threads)`
Sets the number of threads for the CPU math library; with enough CPU cores this can speed up inference. Defaults to 1 and only takes effect on x86.
Parameters:
- `threads(int)` - number of CPU math library threads.
Returns: `None`
Return type: `None`
### `x86_math_library_num_threads()`
Returns the number of CPU math library threads; with enough CPU cores this can speed up inference. Only takes effect on x86.
Parameters:
- `None`
Returns: the number of CPU math library threads.
Return type: `int`
## MobileConfig
```c++
class MobileConfig;
```
`MobileConfig` holds the configuration for building a lightweight PaddlePredictor, such as the path of the NaiveBuffer-format model, the model's memory address (when loading from memory), the power mode, and the number of worker threads.
*Note: the input model must be converted to the NaiveBuffer format with the [Model Optimize Tool](../model_optimize_tool).*
Example:
```c++
MobileConfig config;
// Set the NaiveBuffer-format model directory; used when loading from file
config.set_model_dir(FLAGS_model_dir);
// Set the number of worker threads
config.set_threads(4);
// Set the power mode
config.set_power_mode(LITE_POWER_HIGH);
// Create PaddlePredictor from MobileConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
```
### `set_model_from_file(model_dir)`
Sets the model file; used when loading the model from disk.
Parameters:
- `model_dir(std::string)` - model file path
Returns: `None`
Return type: `void`
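For example (a minimal sketch; the `.nb` path is illustrative):
```c++
MobileConfig config;
config.set_model_from_file("./mobilenet_v1_opt.nb");  // illustrative model path
```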
### `set_model_dir(model_dir)`
**Note**: the Lite model format changed after release/v2.3.0. This interface loads the old format and will be removed in release/v3.0.0; prefer `set_model_from_file` instead.
Sets the model directory path; used when loading the model from disk.
Parameters:
- `model_dir(std::string)` - model directory path
Returns: `None`
Return type: `void`
### `model_dir()`
Returns the configured model directory path.
Parameters:
- `None`
Returns: the model directory path
Return type: `std::string`
### `set_model_from_buffer(model_buffer)`
Sets the in-memory model data; used when loading the model from memory.
Parameters:
- `model_buffer(std::string)` - the model data in memory
Returns: `None`
Return type: `void`
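For example (a minimal sketch; `ReadFile` is the same helper used in the `set_model_buffer` example below, and the path is illustrative):
```c++
std::string model_buffer = ReadFile("./mobilenet_v1_opt.nb");  // illustrative path
MobileConfig config;
config.set_model_from_buffer(model_buffer);
```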
### `set_model_buffer(model_buffer, model_buffer_size, param_buffer, param_buffer_size)`
**Note**: the Lite model format changed after release/v2.3.0. This interface loads the old format and will be removed in release/v3.0.0; prefer `set_model_from_buffer` instead.
Sets the memory addresses of the model and its parameters; used when loading the model from memory.
Example:
```c++
// Read the model files into memory
std::string model_buffer = ReadFile(FLAGS_model_path);
std::string params_buffer = lite::ReadFile(FLAGS_params_path);
// Set up MobileConfig
lite_api::MobileConfig config;
config.set_model_buffer(model_buffer.c_str(), model_buffer.size(),
                        params_buffer.c_str(), params_buffer.size());
// Create PaddlePredictor from MobileConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
```
Parameters:
- `model_buffer(const char*)` - the model topology data in memory.
- `model_buffer_size(size_t)` - size of the model topology data in memory.
- `param_buffer(const char*)` - the model parameter data in memory.
- `param_buffer_size(size_t)` - size of the model parameter data in memory.
Returns: `None`
Return type: `void`
### `model_from_memory()`
Whether the model is loaded from memory; returns `true` when the `set_model_buffer` interface was used.
Parameters:
- `None`
Returns: whether the model is loaded from memory
Return type: `bool`
### `model_buffer()`
Gets the in-memory model topology data.
Parameters:
- `None`
Returns: the model topology data in memory
Return type: `const std::string&`
### `param_buffer()`
Gets the in-memory model parameter data.
Parameters:
- `None`
Returns: the model parameter data in memory
Return type: `const std::string&`
### `set_power_mode(mode)`
Sets the CPU power mode. Defaults to `LITE_POWER_HIGH` if not set.
*Note: this only takes effect when `OpenMP` is enabled; otherwise the system schedules threads automatically.*
Parameters:
- `mode(PowerMode)` - CPU power mode
Returns: `None`
Return type: `void`
### `power_mode()`
Gets the configured CPU power mode.
Parameters:
- `None`
Returns: the configured CPU power mode
Return type: `PowerMode`
### `set_threads(threads)`
Sets the number of worker threads. Defaults to single-threaded if not set.
*Note: this only takes effect when `OpenMP` is enabled; otherwise a single thread is used.*
Parameters:
- `threads(int)` - number of worker threads
Returns: `None`
Return type: `void`
### `threads()`
Gets the configured number of worker threads.
Parameters:
- `None`
Returns: the number of worker threads
Return type: `int`
## PaddlePredictor
```c++
class PaddlePredictor
```
`PaddlePredictor` is the Paddle-Lite predictor, created by `CreatePaddlePredictor` from a `MobileConfig`. Through its interfaces you can set input data, run inference, fetch outputs, and query the version of the library in use.
Example:
```c++
int64_t ShapeProduction(const shape_t& shape) {
  int64_t res = 1;
  for (auto i : shape) res *= i;
  return res;
}

// Set up MobileConfig
MobileConfig config;
config.set_model_dir(FLAGS_model_dir);

// Create PaddlePredictor from MobileConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);

// Get the model's input and output names
std::vector<std::string> input_names = predictor->GetInputNames();
for (int i = 0; i < input_names.size(); i++) {
  printf("Input name[%d]: %s\n", i, input_names[i].c_str());
}
std::vector<std::string> output_names = predictor->GetOutputNames();
for (int i = 0; i < output_names.size(); i++) {
  printf("Output name[%d]: %s\n", i, output_names[i].c_str());
}

// Prepare the input data
// (1) get the input Tensor by index
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
// (2) or get the input Tensor by name
// std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInputByName(input_names[0])));
input_tensor->Resize({1, 3, 224, 224});
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
  data[i] = 1;
}

// Run inference
predictor->Run();

// Fetch the output
// (1) get the output Tensor by index
std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
// (2) or get the output Tensor by name
// std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(output_names[0])));
printf("Output dim: %d\n", output_tensor->shape()[1]);
for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
  printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
}
```
### `GetInput(index)`
Gets a pointer to an input Tensor, used to set the model's input data.
Parameters:
- `index(int)` - index of the input Tensor
Returns: a pointer to the `index`-th input `Tensor`
Return type: `std::unique_ptr<Tensor>`
### `GetOutput(index)`
Gets a pointer to an output Tensor, used to fetch the model's outputs.
Parameters:
- `index(int)` - index of the output Tensor
Returns: a pointer to the `index`-th output `Tensor`
Return type: `std::unique_ptr<Tensor>`
### `GetInputNames()`
Gets the names of all input Tensors.
Parameters:
- `None`
Returns: the names of all input Tensors
Return type: `std::vector<std::string>`
### `GetOutputNames()`
Gets the names of all output Tensors.
Parameters:
- `None`
Returns: the names of all output Tensors
Return type: `std::vector<std::string>`
### `GetInputByName(name)`
Gets a pointer to an input Tensor by name, used to set the model's input data.
Parameters:
- `name(const std::string)` - name of the input Tensor
Returns: a pointer to the input `Tensor`
Return type: `std::unique_ptr<Tensor>`
### `GetTensor(name)`
Gets a pointer to a Tensor by name.
**Note**: `GetTensor` is a debugging interface intended for developers; it can return any node of the [converted](../model_optimize_tool) model. If `GetTensor(InputName)` returns an empty `Tensor`, a likely cause is that the Tensor named `InputName` was fused away during the **subgraph fusion** step of model conversion.
Parameters:
- `name(const std::string)` - name of the Tensor
Returns: a pointer to the `const Tensor`
Return type: `std::unique_ptr<const Tensor>`
### `Run()`
Runs inference; call it ***after setting the input data***.
Parameters:
- `None`
Returns: `None`
Return type: `void`
### `GetVersion()`
Gets the code version used by the current library. If the code carries a tag, the tag is returned, e.g. `v2.0-beta`; otherwise the `branch(commitid)` is returned, e.g. `develop(7e44619)`.
Parameters:
- `None`
Returns: the code version of the current library
Return type: `std::string`
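For example:
```c++
std::cout << "Paddle-Lite version: " << predictor->GetVersion() << std::endl;
```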
## TargetType
```c++
class TargetType;
```
`TargetType` is the target hardware type; choose it according to your deployment scenario.
The possible values of the `TargetType` enum are:
`{X86, CUDA, ARM, OpenCL, FPGA, NPU}`
## PrecisionType
```c++
class PrecisionType {FP32};
```
`PrecisionType` is the data precision of the model's Tensors; the default is FP32 (float32).
The possible values of the `PrecisionType` enum are:
`{FP32, INT8, INT32, INT64}`
## DataLayoutType
```c++
class DataLayoutType {NCHW};
```
`DataLayoutType` is the Tensor data layout; the default is NCHW (number, channel, height, width).
The possible values of the `DataLayoutType` enum are:
`{NCHW, NHWC}`
## Place
```c++
class Place {
  TargetType target;
  PrecisionType precision{FP32};
  DataLayoutType layout{NCHW};
};
```
`Place` is the combination of `TargetType`, `PrecisionType`, and `DataLayoutType`, describing the device type, data precision, and data layout used at run time.
Example:
```C++
Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)}
```
## PowerMode
```c++
enum PowerMode;
```
`PowerMode` is the ARM CPU power mode; choose it according to your scenario for the best energy efficiency.
Example:
```c++
MobileConfig config;
// Set the NaiveBuffer-format model directory
config.set_model_dir(FLAGS_model_dir);
// Set the power mode
config.set_power_mode(LITE_POWER_HIGH);
// Create PaddlePredictor from MobileConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);
```
The PowerMode options in detail:
| Option | Description |
| :------------------: | ------------------------------------------------------------ |
| LITE_POWER_HIGH | Bind to big cores. On big.LITTLE CPUs, prefer and bind to the big cluster. If the thread count exceeds the number of big cores, it is scaled down to that number. If there are no big cores, or binding fails (as can happen on some phones at low battery), the mode falls back to no binding. |
| LITE_POWER_LOW | Bind to little cores. On big.LITTLE CPUs, prefer and bind to the little cluster. If the thread count exceeds the number of little cores, it is scaled down to that number. If no little cores are found, the mode falls back to no binding. |
| LITE_POWER_FULL | Mixed big/little mode. The thread count may exceed the number of big cores; if it exceeds the total core count, it is scaled down to the core count. |
| LITE_POWER_NO_BIND | No core binding (recommended). The system schedules work to idle CPU cores based on load. |
| LITE_POWER_RAND_HIGH | Rotate among big cores. If the big cluster has multiple cores, the binding switches to the next core after every 10 inferences. |
| LITE_POWER_RAND_LOW | Rotate among little cores. If the little cluster has multiple cores, the binding switches to the next core after every 10 inferences. |
## Tensor
```c++
class Tensor
```
Tensor is the data container of Paddle-Lite: it wraps the underlying data and exposes interfaces for manipulating it, including setting the shape, the data, and LoD information.
*Note: use the `GetInput` and `GetOutput` interfaces of `PaddlePredictor` to obtain input/output `Tensor`s.*
Example:
```c++
int64_t ShapeProduction(const shape_t& shape) {
  int64_t res = 1;
  for (auto i : shape) res *= i;
  return res;
}

// Set up MobileConfig
MobileConfig config;
config.set_model_dir(FLAGS_model_dir);

// Create PaddlePredictor from MobileConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>(config);

// Prepare the input data: get the input Tensor
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
// Set the input Tensor's shape
input_tensor->Resize({1, 3, 224, 224});
// Set the input data
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
  data[i] = 1;
}

// Run inference
predictor->Run();

// Get the output Tensor
std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
// Get the output Tensor's shape
printf("Output dim: %d\n", output_tensor->shape()[1]);
// Read the output Tensor's data
for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
  printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
}
```
### `Resize(shape)`
Sets the Tensor's shape.
Parameters:
- `shape(std::vector<int64_t>)` - the shape
Returns: `None`
Return type: `void`
### `shape()`
Gets the Tensor's shape.
Parameters:
- `None`
Returns: the Tensor's shape
Return type: `std::vector<int64_t>`
### `data<T>()`
```c++
template <typename T>
const T* data() const;
```
Gets a const pointer to the Tensor's underlying data, typed according to the template argument. Used to read Tensor data.
Example:
```c++
std::unique_ptr<const Tensor> output_tensor(std::move(predictor->GetOutput(0)));
// if the model output is of type float
output_tensor->data<float>()
```
Parameters:
- `None`
Returns: a const pointer to the `Tensor`'s underlying data
Return type: `const T*`
### `mutable_data<T>()`
```c++
template <typename T>
T* mutable_data() const;
```
Gets a mutable pointer to the Tensor's underlying data, typed according to the template argument. Used to set Tensor data.
Example:
```c++
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
// if the model input is of type float
auto* data = input_tensor->mutable_data<float>();
// set the Tensor's data
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
  data[i] = 1;
}
```
Parameters:
- `None`
Returns: a pointer to the `Tensor`'s underlying data
Return type: `T*`
### `SetLoD(lod)`
Sets the Tensor's LoD information.
Parameters:
- `lod(std::vector<std::vector<uint64_t>>)` - the Tensor's LoD information
Returns: `None`
Return type: `void`
### `lod()`
Gets the Tensor's LoD information.
Parameters:
- `None`
Returns: the `Tensor`'s LoD information
Return type: `std::vector<std::vector<uint64_t>>`
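A minimal sketch of the LoD interfaces above (the sequence lengths are illustrative):
```c++
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
// two sequences of lengths 2 and 3, stored as cumulative offsets {0, 2, 5}
std::vector<std::vector<uint64_t>> lod{{0, 2, 5}};
input_tensor->SetLoD(lod);
```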
# Inference Library Variants
Building Paddle-Lite produces inference library files (static and shared); see [source compilation](./source_compile) for the build process.
Lite inference libraries come in a **basic** and a **full** variant: the basic library packages only the operators needed by common base models and is smaller, while the full library packages all Lite operators and supports more models at the cost of a larger binary. The choice is controlled by the `build_extra` build option (OFF by default): `--build_extra=OFF` builds the basic library, `--build_extra=ON` builds the full library.
## Basic inference library
### How to build
Set `--build_extra=OFF` (the default) or omit the option to build the basic library. For example:
```
./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish
```
### Features of the basic library
(1) Supports common CV base models
(2) Supports basic int8 quantized models
(3) Supports [benchmark testing](../benchmark/benchmark)
### Base models supported by the basic library:
1. Basic fluid models (the 9 base models provided by paddle model)
```
mobileNetV1     mnasnet      yolov3     ssd_mobilenetv1    shufflenet_v2
mobileNetV2     resnet50     unet       squeezenet_v11
```
2. int8 quantized models
```
mobilenet_v1    mobilenet_v2    resnet50
```
### Characteristics
A lightweight inference library: smaller binary, supporting the common base models.
## Full inference library
### How to build
Set `--build_extra=ON` to build the full library. For example:
```
./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish
```
### Features of the full library
(1) Everything the basic library supports
(2) All operators registered in Paddle-Lite
### Characteristics
Supports more hardware platforms and operators, and therefore more models, at the cost of a larger binary.
# Building from Source
Paddle-Lite provides a one-step build script for mobile targets, `lite/tools/build.sh`. The build flow is:
1. Environment setup (pick one): a Docker cross-compilation environment or a Linux cross-compilation environment
2. Build: invoke the `build.sh` script
## 1. Environment setup
Three build environments are currently supported:
1. a Docker container,
2. Linux (Ubuntu 16.04 recommended),
3. Mac OS.
### 1. Docker environment
[Docker](https://www.docker.com/) is an open-source application container engine that uses sandboxing to create isolated containers in which different programs can run. Docker newcomers can follow [this Docker introduction](https://thenewstack.io/docker-station-part-one-essential-docker-concepts-tools-terminology/) to install Docker correctly.
#### Prepare the Docker image
There are two ways to prepare the image; pulling it directly from Dockerhub is recommended:
```shell
# Option 1: pull the image directly from Dockerhub
docker pull paddlepaddle/paddle-lite:2.0.0_beta

# Option 2: build the Docker image locally from source
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite/lite/tools
mkdir mobile_image
cp Dockerfile.mobile mobile_image/Dockerfile
cd mobile_image
docker build -t paddlepaddle/paddle-lite .
# Once the image is built, `docker images` will list the `paddlepaddle/paddle-lite` image.
```
#### Enter the Docker container
From the directory one level above the cloned Paddle-Lite repository, run the following to enter the container:
```shell
docker run -it \
  --name paddlelite_docker \
  -v $PWD/Paddle-Lite:/Paddle-Lite \
  --net=host \
  paddlepaddle/paddle-lite /bin/bash
```
This names the container `paddlelite_docker` (its `<container-name>`), mounts the local `Paddle-Lite` folder at `/Paddle-Lite` inside the container, and drops you into the container. The Docker environment is now ready.
#### Common Docker commands
```shell
# Detach from the container without stopping it: press CTRL + q + p together
# Start a stopped container
docker start <container-name>
# Attach a shell to a running container
docker attach <container-name>
# Stop a running container
docker stop <container-name>
# Restart a running container
docker restart <container-name>
# Remove a container
docker rm <container-name>
```
### 2. Linux environment
#### Android
##### Cross-compilation requirements
- gcc, g++, git, make, wget, python, adb
- a Java environment
- cmake (3.10 or later recommended)
- Android NDK (ndk-r17c recommended)
##### Steps
The package installation below uses Ubuntu as the example; other Linux distributions are similar.
```shell
# 1. Install basic software
apt update
apt-get install -y --no-install-recommends \
gcc g++ git make wget python unzip adb curl
# 2. Prepare Java env.
apt-get install -y default-jdk
# 3. Install cmake 3.10 or above
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
# 4. Download Android NDK for linux-x86_64
# Note: Skip this step if NDK is already installed
# android-ndk-r17c-linux-x86_64 is recommended
# ref: https://developer.android.com/ndk/downloads
cd /tmp && curl -O https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip
cd /opt && unzip /tmp/android-ndk-r17c-linux-x86_64.zip
# 5. Add environment ${NDK_ROOT} to `~/.bashrc`
echo "export NDK_ROOT=/opt/android-ndk-r17c" >> ~/.bashrc
source ~/.bashrc
```
#### ARM Linux
Applies to development boards built around ARMv8 and ARMv7 CPUs, such as the RK3399 and the Raspberry Pi. Both cross compilation and native compilation are currently supported; with cross compilation, the built program can be copied to the board via scp after the build finishes.
##### Cross compilation
###### Requirements
- gcc, g++, git, make, wget, python, scp
- cmake (3.10 or later recommended)
###### Steps
The package installation below uses Ubuntu as the example; other Linux distributions are similar.
```shell
# 1. Install basic software
apt update
apt-get install -y --no-install-recommends \
gcc g++ git make wget python unzip
# 2. Install arm gcc toolchains
apt-get install -y --no-install-recommends \
g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \
g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \
gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
# 3. Install cmake 3.10 or above
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
```
##### Native compilation (directly on an RK3399 or Raspberry Pi)
###### Requirements
- gcc, g++, git, make, wget, python
- cmake (3.10 or later recommended)
###### Steps
The package installation below uses Ubuntu as the example; other Linux distributions are similar.
```shell
# 1. Install basic software
apt update
apt-get install -y --no-install-recommends \
gcc g++ make wget python unzip
# 2. install cmake 3.10 or above
wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz
tar -zxvf cmake-3.10.3.tar.gz
cd cmake-3.10.3
./configure
make
sudo make install
```
Afterwards, run `cmake --version` to check that cmake was installed successfully.
This completes the Linux build environment setup.
### 3. Mac OS environment
#### Cross-compilation requirements
- gcc, git, make, curl, unzip, java
- cmake (use version 3.10 for Android builds and 3.15 for iOS builds)
- for Android builds: Android NDK (ndk-r17c recommended)
- for iOS builds: XCode (Version 10.1)
#### Steps
```bash
# 1. Install basic software
brew install curl gcc git make unzip wget

# 2. Install cmake: the cmake versions required for iOS and Android builds on
# Mac differ, so install whichever you need.
# (1) Building the Android version of Paddle-Lite on Mac requires cmake 3.10:
#     mkdir /usr/local/Cellar/cmake/ && cd /usr/local/Cellar/cmake/
#     wget https://cmake.org/files/v3.10/cmake-3.10.2-Darwin-x86_64.tar.gz
#     tar zxf ./cmake-3.10.2-Darwin-x86_64.tar.gz
#     mv cmake-3.10.2-Darwin-x86_64/CMake.app/Contents/ ./3.10.2
#     ln -s /usr/local/Cellar/cmake/3.10.2/bin/cmake /usr/local/bin/cmake
# (2) Building the iOS version of Paddle-Lite on Mac requires cmake 3.15:
#     mkdir /usr/local/Cellar/cmake/ && cd /usr/local/Cellar/cmake/
#     wget https://cmake.org/files/v3.15/cmake-3.15.2-Darwin-x86_64.tar.gz
#     tar zxf ./cmake-3.15.2-Darwin-x86_64.tar.gz
#     mv cmake-3.15.2-Darwin-x86_64/CMake.app/Contents/ ./3.15.2
#     ln -s /usr/local/Cellar/cmake/3.15.2/bin/cmake /usr/local/bin/cmake

# 3. Download Android NDK for Mac
# android-ndk-r17c-darwin-x86_64 is recommended
# ref: https://developer.android.com/ndk/downloads
# Note: Skip this step if NDK is already installed
cd ~/Documents && curl -O https://dl.google.com/android/repository/android-ndk-r17c-darwin-x86_64.zip
cd ~/Library && unzip ~/Documents/android-ndk-r17c-darwin-x86_64.zip

# 4. Add environment ${NDK_ROOT} to `~/.bash_profile`
echo "export NDK_ROOT=~/Library/android-ndk-r17c" >> ~/.bash_profile
source ~/.bash_profile

# 5. Install a Java environment
brew cask install java

# 6. iOS builds require XCode (Version 10.1), available from the App Store.
# After installing, launch it once and run:
# sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
```
The Mac cross-compilation environment is now ready.
**Note**: when building the full_publish version of Paddle-Lite on Mac, the path to Paddle-Lite must not contain Chinese characters.
## 2. Building Paddle-Lite
### Download the code
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
git checkout <release-version-tag>
```
### Build modes and parameters
The build script `./lite/tools/build.sh` supports three build modes:
| Build mode | Description | Audience |
|:-------:|-----|:-------:|
| tiny_publish | Builds the mobile deployment library, with no third-party dependencies | users |
| full_publish | Builds the mobile deployment library with third-party dependencies such as protobuf and gflags; includes the tool that converts models to the protobuf-free naive buffer format used by the tiny_publish library | users |
| test | Builds the mobile unit tests for the given `arm_os` and `arm_abi` | framework developers |
Additional parameters of `./lite/tools/build.sh`:
| Parameter | Description | Values |
|-----------|-------------|-------------|
| --arm_os | Required; target platform | `android`, `ios`, `ios64`, `armlinux` |
| --arm_abi | Required; ARM variant to build; `armv7hf` is for ARMLinux builds | `armv8`, `armv7`, `armv7hf` (`armlinux` only) |
| --arm_lang | Required when arm_os=android; compiler choice | `gcc`, `clang` (`clang` is not supported yet) |
| --android_stl | Required when arm_os=android; link the STL statically or dynamically | `c++_static`, `c++_shared` |
| --build_java | Optional; build the Java library (OFF by default) | `ON`, `OFF` |
| --build_extra | Optional; build the full library (OFF by default); see [library variants](./library.html) | `ON`, `OFF` |
| target | Required; build mode: `tiny_publish` builds the mobile deployment library, `full_publish` the deployment library with dependencies, `test` the mobile unit tests, `ios` the iOS `tiny_publish` | `tiny_publish`, `full_publish`, `test`, `ios` |
### Building
**<font color="orange">Note</font>**: <font color="orange">non-developers are advised to apply the</font> [**"Speeding up third-party dependency downloads"**](#id22) <font color="orange">method before building, to speed up downloading and building the project's third-party dependencies.</font>
#### Building the `tiny publish` shared library
##### Android
```shell
./lite/tools/build.sh \
  --arm_os=android \
  --arm_abi=armv8 \
  --arm_lang=gcc \
  --android_stl=c++_static \
  --build_extra=OFF \
  tiny_publish
```
##### iOS
```shell
./lite/tools/build.sh \
  --arm_os=ios64 \
  --arm_abi=armv8 \
  --build_extra=OFF \
  ios
```
**Note: building for iOS on Mac requires cmake newer than 3.15; building for Android on Mac requires cmake 3.10.**
Build options supported by iOS tiny publish:
* `--arm_os`: `ios` or `ios64`
* `--arm_abi`: `armv7` or `armv8` (**note**: `arm_os=ios` only allows `arm_abi=armv7`, and `arm_os=ios64` only allows `arm_abi=armv8`)
* If the Mac build fails with "Invalid CMAKE_DEVELOPER_ROOT: does not exist", run:
```shell
sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
```
##### ARMLinux
```shell
./lite/tools/build.sh \
  --arm_os=armlinux \
  --arm_abi=armv7hf \
  --arm_lang=gcc \
  --build_extra=OFF \
  tiny_publish
```
- `--arm_abi`: use armv7hf for the Raspberry Pi 3B and armv8 for the RK3399
#### Building the `full publish` shared library
##### Android
```shell
./lite/tools/build.sh \
  --arm_os=android \
  --arm_abi=armv8 \
  --arm_lang=gcc \
  --android_stl=c++_static \
  --build_extra=OFF \
  full_publish
```
##### ARMLinux
```shell
./lite/tools/build.sh \
--arm_os=armlinux \
--arm_abi=armv7hf \
--arm_lang=gcc \
--build_extra=OFF \
full_publish
```
- `--arm_abi`: use armv7hf for the Raspberry Pi 3B and armv8 for the RK3399
### Build output
**The final build products** are under `build.lite.xxx.xxx.xxx`, in `inference_lite_lib.xxx.xxx`; for example, the Android ARMv8 products are in `inference_lite_lib.android.armv8`.
![](https://user-images.githubusercontent.com/45189361/65375706-204e8780-dccb-11e9-9816-ab4563ce0963.png)
The **directory contents** (may) look as follows:
**Full_publish output:**
![](https://user-images.githubusercontent.com/45189361/65375704-19c01000-dccb-11e9-9650-6856c7a5bf82.png)
**Tiny_publish output:**
![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png)
**iOS output:**
![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png)
**Contents** in detail:
1. `bin` folder: the executable tools `paddle_code_generator` and `test_model_bin`
2. `cxx` folder: the C++ libraries and corresponding headers
  - `include`: header files
  - `lib`: library files
    - bundled static libraries:
      - `libpaddle_api_full_bundled.a`: static library with both full_api and light_api functionality
      - `libpaddle_api_light_bundled.a`: static library with light_api functionality only
    - shared libraries:
      - `libpaddle_full_api_shared.so`: shared library with both full_api and light_api functionality
      - `libpaddle_light_api_shared.so`: shared library with light_api functionality only
3. `demo` folder: example demos, including a C++ demo and a Java demo
  - `cxx`: C++ demos
    - `mobile_full`: full_api usage example
    - `mobile_light`: light_api usage example
  - `java`: Java demos
    - `android`: Java Android demo
4. `java` folder: the JNI shared library and the corresponding Jar package
  - `jar`: `PaddlePredictor.jar`
  - `so`: the JNI shared library `libpaddle_lite_jni.so`
5. `third_party` folder: third-party libraries (`gflags`)
**Notes:**
1. The following are only built when `--arm_os=android`:
  - the Java library and demo: `Java` and `demo/java`
  - the shared libraries `libpaddle_full_api_shared.so` and `libpaddle_light_api_shared.so`
2. The `tiny_publish` output does not include the C++ demo or the C++ static libraries, but it does provide the C++ light_api shared library, the JNI shared library, and the Java demo.
### Speeding up third-party dependency downloads
The third-party libraries needed for the mobile builds live under `<PaddleLite>/third-party`; by default the build fetches the dependency repositories with `git submodule update --init --recursive`.
To speed up the download of `protobuf` and the other third-party dependencies used by the `full_publish` and `test` build modes, `build.sh` and `ci_build.sh` can instead download dependency tarballs from a domestic CDN.
Usage: after `git clone`-ing the `Paddle-Lite` repository, manually delete the `third-party` directory at the repository root:
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
git checkout <release-version-tag>
cd Paddle-Lite
rm -rf third-party
```
Subsequent builds following this document will then skip the third-party `submodule`s and download the tarballs instead.
# Running Models on CUDA with Lite
Lite supports building and running with CUDA on x86_64 and arm64 architectures (e.g. the TX2).
## Building
**NOTE:** when building on NVIDIA embedded hardware such as the TX2, install the dependencies with the latest [Jetpack](https://developer.nvidia.com/embedded/jetpack).
Step 1: download the code
```
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
```
Step 2: build
```
# enter the source directory
cd Paddle-Lite
# run the build script
# when the build finishes, a build_cuda directory is created here
# if CUDA or CUDNN cannot be found during the build, set the environment
# variables CUDA_TOOLKIT_ROOT_DIR and CUDNN_ROOT, which point to the CUDA
# and CUDNN root directories respectively
./lite/tools/build.sh cuda
# to use the Python API, enable the build_python option
./lite/tools/build.sh --build_python=ON cuda
```
When the build finishes, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`.
## Running
The following uses a Yolov3 model to show how to run a model on an NVIDIA GPU.
Step 1: download the darknet_yolov3 model; see [here](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3) for model details
```
# download the model
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz
tar -zxf yolov3_infer.tar.gz
# download a sample image
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
```
Step 2: run
**NOTE:** this example uses the Python API; a C++ API and examples will be released later.
``` python
#-*- coding: utf-8 -*-
from __future__ import print_function
import sys
import numpy as np
import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import *
def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32')
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
h, w, _ = im.shape
im_scale_x = resize_h / float(w)
im_scale_y = resize_w / float(h)
out_img = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_CUBIC)
mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, -1))
std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, -1))
out_img = (out_img / 255.0 - mean) / std
out_img = out_img.transpose((2, 0, 1))
return out_img
# set up the config
a = CxxConfig()
a.set_model_file('./yolov3_infer/__model__') # path to the model file
a.set_param_file('./yolov3_infer/__params__') # path to the params file
place_cuda = Place(TargetType.CUDA)
a.set_valid_places([place_cuda])
# create the predictor
predictor = create_paddle_predictor(a)
# set the inputs
input_tensor = predictor.get_input(0);
height, width = 608, 608
input_tensor.resize([1, 3, height, width])
data = read_img('./kite.jpg', height, width).flatten()
input_tensor.set_float_data(data, TargetType.CUDA)
in2 = predictor.get_input(1);
in2.resize([1, 2])
in2.set_int32_data([height, width], TargetType.CUDA)
# run
predictor.run()
# fetch the outputs
output_tensor = predictor.get_output(0);
print (output_tensor.shape())
# [100L, 6L]
print (output_tensor.target())
# TargetType.Host
print (output_tensor.float_data()[:6])
# [0.0, 0.9862784743309021, 98.51927185058594, 471.2381286621094, 120.73092651367188, 578.33251953125]
```
**NOTE:** CUDA support is still under active development.
# Trimming the Inference Library
Paddle-Lite supports **trimming the inference library based on a model**. A regular Paddle-Lite build packages every registered operator into the library, inflating its size; **library trimming** packages only the operators that a specific optimized model needs, effectively reducing the library size.
## Results (size of the Tiny_publish Android shared library)
| Test model | Trimming switch | **libpaddle_lite_jni.so** | OPs in the converted model |
| ------------------ | ---------------------------- | -------- |------------------|
| mobilenetv1(armv8) | before trimming --build_tailor=OFF | 1.5M | feed,fetch,conv2d,depthwise_conv2d,fc,pool2d,softmax |
| mobilenetv1(armv8) | after trimming --build_tailor=ON | 788K | feed,fetch,conv2d,depthwise_conv2d,fc,pool2d,softmax |
| mobilenetv2(armv8) | before trimming --build_tailor=OFF | 1.5M | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax |
| mobilenetv2(armv8) | after trimming --build_tailor=ON | 912K | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax |
| mobilenetv1(armv7) | before trimming --build_tailor=OFF | 938K | feed,fetch,concat,conv2d,dropout,fc,pool2d,softmax |
| mobilenetv1(armv7) | after trimming --build_tailor=ON | 607K | feed,fetch,concat,conv2d,dropout,fc,pool2d,softmax |
| mobilenetv2(armv7) | before trimming --build_tailor=OFF | 938K | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax |
| mobilenetv2(armv7) | after trimming --build_tailor=ON | 687K | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax |
## How it works
### 1. Record the optimized model's info during conversion
When converting a model with model_optimize_tool, pass `--record_tailoring_info=true` to save the optimized model's OP and kernel information into the output folder; this information is then used to build the trimmed shared library.
Note: use a model_optimize_tool built from recent Paddle-Lite code (after release/v2.0.0).
Example:
```bash
./model_optimize_tool --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1NB --record_tailoring_info=true --valid_targets=arm
```
Effect: the OP and kernel information used by the optimized model is saved in hidden files inside the `mobilenet_v1NB` folder.
### 2. Build the trimmed library from the recorded info
Build Paddle-Lite with `--build_tailor=ON`, and point `--opt_model_dir=` at the optimized model.
Example:
```bash
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish
```
**Note**: `../mobilenet_v1NB` in the command above is the output path of the model converted in step 1.
**Effect**: the resulting shared library is smaller and can run the optimized model.
The built C++ libraries are located at:
`build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/`
The built Java libraries are located at:
`build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/java/so/`
### 3、运行裁剪后的预测库文件
注意:基于某一模型裁剪出的预测库只能支持优化工具转化后的该模型,例如根据mobilenetV1裁剪出的 full_api预测库只能运行以protobuf格式转化出的模型mobilenetV1_opt_nb, 裁剪出的light_api预测库只能运行以naive_buffer格式转化出的模型mobilenetV1_opt_nb, 运行其他模型可能会出现`segementation fault:undifined op or kernel`。 模型转化方法参考:[使用opt转化模型](./model_optimize_tool))。
**Example 1**: run mobilenetv1 with the tailored light_api library
1. After building in step 2, the light_api C++ demo is located at
`/Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/demo/cxx/mobile_light`
Run `make` there to build the executable mobilenetv1_light_api.
2. Push the mobilenetV1_NB model and mobilenetv1_light_api to the phone via adb, then run the demo:
`./mobilenetv1_light_api --model_dir=./mobilenetV1_NB`
Note: `mobilenetV1_NB` is the `mobilenetV1` model converted to naive_buffer format (no need to pass `--record_tailoring_info=true`; for the conversion steps see [converting models with opt](./model_optimize_tool)).
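As a minimal sketch, the adb steps in item 2 could look like this (the local file locations and device paths are assumptions):
```bash
# Sketch of the adb steps (local and device paths are assumptions)
adb push mobilenetv1_light_api /data/local/tmp/
adb push mobilenetV1_NB /data/local/tmp/mobilenetV1_NB
adb shell "cd /data/local/tmp && chmod +x ./mobilenetv1_light_api && \
           ./mobilenetv1_light_api --model_dir=./mobilenetV1_NB"
```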
**Example 2**: run mobilenetv1 with the tailored full_api library
1. After building in step 2, the full_api C++ demo is located at
`/Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/demo/cxx/mobile_light`
Replace the contents of mobilenetv1_full_api.cc with:
```C++
#include <gflags/gflags.h>
#include <stdio.h>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
DEFINE_string(model_dir, "", "Model dir path.");
int64_t ShapeProduction(const shape_t& shape) {
  int64_t res = 1;
  for (auto i : shape) res *= i;
  return res;
}
void RunModel() {
  // 1. Set CxxConfig
  CxxConfig config;
  config.set_model_file(FLAGS_model_dir + "model");
  config.set_param_file(FLAGS_model_dir + "params");
  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
  config.set_valid_places(valid_places);
  // 2. Create PaddlePredictor by CxxConfig
  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<CxxConfig>(config);
  // 3. Prepare input data
  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
  input_tensor->Resize(shape_t({1, 3, 224, 224}));
  auto* data = input_tensor->mutable_data<float>();
  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
    data[i] = 1;
  }
  // 4. Run predictor
  predictor->Run();
  // 5. Get output
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
  printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
  }
}
int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  RunModel();
  return 0;
}
```
2. Push the mobilenetV1_PB model and mobilenetv1_full_api to the phone via adb, then run the demo:
`./mobilenetv1_full_api --model_dir=./mobilenetV1_PB`
Note: `mobilenetV1_PB` is the `mobilenetV1` model converted to protobuf format (no need to pass `--record_tailoring_info=true`; for the conversion steps see [converting models with opt](./model_optimize_tool)).
## Tailoring the prediction library with a model set
For convenience, we also support tailoring the prediction library against a set of models. Given a model set, Model Optimize Tool analyzes the operators required by the **optimized** versions of those models and tailors the library accordingly, so users can trim the library's operators to whatever set of models they need.
Usage:
```shell
# non-combined model set
./model_optimize_tool \
--model_set_dir=<your_model_set_dir> \
--optimize_out_type=naive_buffer \
--optimize_out=<output_model_set_dir> \
--record_tailoring_info=true \
--valid_targets=arm
# combined model set
./model_optimize_tool \
--model_set_dir=<your_model_set_dir> \
--optimize_out_type=naive_buffer \
--model_filename=<model_topo_filename> \
--param_filename=<model_param_filename> \
--optimize_out=<output_model_set_dir> \
--record_tailoring_info=true \
--valid_targets=arm
```
After these steps, optimized NaiveBuffer-format models for every model in the set are generated in `<output_model_set_dir>`, together with the operator information collected from the whole set. The next step, building the tailored prediction library, is the same as in the single-model flow above (see the sketch after the notes below).
**Notes:**
1. The model set **must** contain either only combined-param models or only non-combined-param models.
2. For non-combined models, each topology file must be named `__model__`; for combined models, all models in the set must share the same topology and parameter file names, specified via `--model_filename` and `--param_filename`.
3. The model set **must** contain either only INT8-quantized models or only non-INT8 models.
4. This requires a model_optimize_tool built from recent Paddle-Lite code (after release/v2.1.0).
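A minimal sketch of that follow-up build, reusing the command from step 2 of the single-model flow and pointing `--opt_model_dir` at the model-set output (all option values are assumptions):
```bash
# Sketch: build a library tailored to the whole model set (values are assumptions)
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc \
    --android_stl=c++_static --build_extra=ON --build_tailor=ON \
    --opt_model_dir=<output_model_set_dir> full_publish
```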
# Model Conversion
The high inference performance of the Lite architecture comes from its rich set of optimizations, including quantization, subgraph fusion, hybrid scheduling, and kernel selection. To make this optimization process easy to use, we provide **opt**, which runs the optimization steps automatically and emits a lightweight, optimized, executable model. Usage is described below.
**Note**: before release/v2.2.0 the conversion tool was named `model_optimize_tool`; starting with release/v2.3 it is named `opt`.
## Getting opt
There are currently three ways to obtain opt:
1. Download the prebuilt binaries from the current develop branch: [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) and [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac).
For versions before release/v2.2.0, download model_optimize_tool: [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) and [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac).
2. Go to the [release page](https://github.com/PaddlePaddle/Paddle-Lite/releases) of the Paddle-Lite GitHub repository and download the conversion tool `opt` for the chosen release
(the tool is named model_optimize_tool before release/v2.2.0 and opt from release/v2.3.0 on).
3. Download the Paddle-Lite source code and build opt from it:
```bash
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
git checkout <release-version-tag>
./lite/tools/build.sh build_optimize_tool
```
The build output is located at `Paddle-Lite/build.opt/lite/api/opt`.
**Note**: before building opt from source, first [set up the Paddle-Lite development environment](../installation/source_compile).
## Using opt
opt is an x86 executable and runs on a PC, in either a Linux or a Mac terminal.
### Help information
Running opt without any arguments prints the help text listing the supported options:
```bash
./opt
```
![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)
### Feature 1: convert a model to the Paddle-Lite format
opt converts models supported by PaddlePaddle into the model format supported by Paddle-Lite. Along the way it converts the protobuf model file into a naive_buffer model file, which markedly reduces model size, and applies graph optimizations such as quantization, subgraph fusion, hybrid scheduling, and kernel selection, improving runtime speed, memory usage, and other performance metrics on Paddle-Lite.
The optimization workflow:
(1) Prepare the PaddlePaddle model to be optimized
PaddlePaddle models come in two storage formats:
Combined param: all parameters are stored in a single `params` file, and the model topology is stored in the `__model__` file.
![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png)
Separated param: parameters are stored across multiple parameter files, and the model topology is stored in the `__model__` file.
![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png)
(2) Run `opt` in a terminal to optimize the model
**Example**: converting the `mobilenet_v1` model
```bash
./opt --model_dir=./mobilenet_v1 --valid_targets=arm --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1_opt
```
The command above converts the `mobilenet_v1` model into a Paddle-Lite model for the arm platform in naive_buffer format; the optimized model file is `mobilenet_v1_opt.nb`. The result looks like this:
![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png)
(3) **Full conversion command** reference:
```shell
./opt \
--model_dir=<model_param_dir> \
--model_file=<model_path> \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86|npu|xpu) \
--prefer_int8_kernel=(true|false) \
    --record_tailoring_info=(true|false)
```
| Option | Description |
| ------------------- | ------------------------------------------------------------ |
| --model_dir | Path of the PaddlePaddle model to optimize (non-combined form). |
| --model_file | Path of the network-structure file of the PaddlePaddle model to optimize (combined form). |
| --param_file | Path of the weights file of the PaddlePaddle model to optimize (combined form). |
| --optimize_out_type | Output model type; protobuf and naive_buffer are currently supported, where naive_buffer is a more lightweight serialization/deserialization implementation. For mobile inference, set this to naive_buffer. Defaults to protobuf. |
| --optimize_out | Output path for the optimized model. |
| --valid_targets | Backends the model may run on; defaults to arm. Currently x86, arm, opencl, npu, and xpu are supported, and several backends can be given at once (separated by spaces); Model Optimize Tool will pick the best option automatically. To support Huawei NPU (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set this to npu, arm. |
| --prefer_int8_kernel | If the model to optimize is an int8 quantized model (e.g., one produced by quantization-aware training), set this to true to use int8 kernels for faster inference. Defaults to false. |
| --record_tailoring_info | When using the [library tailoring](./library_tailoring.html) feature, set this to true to record the kernel and OP information of the optimized model. Defaults to false. |
* If the fluid model to optimize is in non-combined form, set `--model_dir` and omit `--model_file` and `--param_file`.
* If the fluid model to optimize is in combined form, set `--model_file` and `--param_file` and omit `--model_dir`.
* The optimized model consists of the __model__.nb and param.nb files.
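The example above covers only the non-combined case; here is a minimal sketch for a combined-form model (the `./mobilenet_v1/__model__` and `./mobilenet_v1/params` paths are placeholders following the combined layout described earlier; adjust them to your own files):
```bash
# Sketch: convert a combined-form model (paths are placeholders)
./opt \
    --model_file=./mobilenet_v1/__model__ \
    --param_file=./mobilenet_v1/params \
    --optimize_out_type=naive_buffer \
    --optimize_out=mobilenet_v1_opt \
    --valid_targets=arm
```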
### Feature 2: list the model's operators and check support
opt can list the operators contained in a model and report whether Paddle-Lite supports that model; it can also print which operators the current Paddle-Lite supports.
(1) List the operators in a model with opt
The following command prints every operator contained in the mobilenet_v1 model and reports whether Paddle-Lite supports the model on the `valid_targets` platform:
`./opt --print_model_ops=true --model_dir=mobilenet_v1 --valid_targets=arm`
![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png)
(2) Print the operators supported by the current Paddle-Lite with opt
`./opt --print_all_ops=true`
The command above prints every operator currently supported by Paddle-Lite, including the number of OPs and the platforms each OP supports:
![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png)
`./opt --print_supported_ops=true --valid_targets=x86`
The command above prints every OP supported by Paddle-Lite when `valid_targets=x86`:
![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png)
## Extra: a one-step script combining x2paddle and opt
**Background**: to run third-party models (tensorflow, caffe, onnx) on Paddle-Lite, two conversions are normally required: first use x2paddle to convert the third-party model to the PaddlePaddle format, then use opt to convert the PaddlePaddle model to a format Paddle-Lite supports.
To simplify this, we provide a one-step script that combines the x2paddle and opt conversions:
**One-step conversion script**: [auto_transform.sh](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/auto_transform.sh)
**Environment**: the `auto_transform.sh` script requires an x2paddle environment; see the [x2paddle installation guide](https://github.com/PaddlePaddle/X2Paddle#环境依赖) to install x2paddle and its dependencies.
**Usage**
(1) Print the help information: ` ./auto_transform.sh`
(2) Convert a model
```bash
USAGE:
auto_transform.sh combines the functions of x2paddle and opt; it can
transform a model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form.
----------------------------------------
example:
./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result
----------------------------------------
Arguments about x2paddle:
--framework=(tensorflow|caffe|onnx);
--model='model file for tensorflow or onnx';
--prototxt='proto file for caffe' --weight='weight file for caffe'
For TensorFlow:
--framework=tensorflow --model=tf_model.pb
For Caffe:
--framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel
For ONNX:
--framework=onnx --model=onnx_model.onnx
Arguments about opt:
--valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite.
--fluid_save_dir='path to the fluid model output by x2paddle'
--optimize_out='path to the output Paddle-Lite model'
----------------------------------------
```
# Lite inference on ARM GPUs via OpenCL
Lite can run OpenCL-based programs on Android; cross-compilation for armv8 and armv7 under Ubuntu is currently supported.
## Building
### Build environment
1. A Docker container environment;
2. A Linux environment (Ubuntu 16.04 recommended).
See the **Environment preparation** section of the source build guide for details.
### Build options
|Option|Description|Values|
|--------|--------|--------|
|--arm_os|Target operating system|currently only `android` is supported (and it is the default)|
|--arm_abi|Target architecture, armv8 or armv7|defaults to `armv8` (arm64-v8a); `armv7` means armeabi-v7a|
|--arm_lang|Compiler used to build the target|defaults to gcc; gcc and clang are supported|
### Example: building the Paddle-Lite OpenCL library
Note: this example targets android-armv8-opencl and builds inside a Docker container, with CMake 3.10 and android-ndk-r17c under `/opt/`.
```bash
# assume we are in the root of the Lite source tree
# export NDK_ROOT; double-check the path if your install location differs
export NDK_ROOT=/opt/android-ndk-r17c
# remove the .h files auto-generated by the previous CMake run
rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# build with the given options
./lite/tools/ci_build.sh \
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
build_test_arm_opencl
```
The build products are in the `inference_lite_lib.android.armv8.opencl` folder under `build.lite.android.armv8.gcc.opencl`. The key products are:
- `cxx`: C++ headers and libraries for the build target;
- `demo`: two demos that call `libpaddle_api_full_bundled.a` and `libpaddle_api_light_bundled.a`, in the `mobile_full` and `mobile_light` folders respectively; to build a demo, simply run `make` in the corresponding folder;
- `mobile_full`: uses the cxx config and can load fluid models directly; to use OpenCL, enable the `DEMO_USE_OPENCL` macro in `mobilenetv1_full_api.cc` (see the code comments for details);
- `mobile_light`: uses the mobile config and can only load models optimized by `model_optimize_tool`;
- `opencl`: the OpenCL kernel implementations.
```bash
.
|-- cxx
| |-- include
| | |-- paddle_api.h
| | |-- paddle_image_preprocess.h
| | |-- paddle_lite_factory_helper.h
| | |-- paddle_place.h
| | |-- paddle_use_kernels.h
| | |-- paddle_use_ops.h
| | `-- paddle_use_passes.h
| `-- lib
| |-- libpaddle_api_full_bundled.a
| |-- libpaddle_api_light_bundled.a
| |-- libpaddle_full_api_shared.so
| `-- libpaddle_light_api_shared.so
|-- demo
| `-- cxx
| |-- Makefile.def
| |-- README.md
| |-- include
| | |-- paddle_api.h
| | |-- paddle_lite_factory_helper.h
| | |-- paddle_place.h
| | |-- paddle_use_kernels.h
| | |-- paddle_use_ops.h
| | `-- paddle_use_passes.h
| |-- mobile_full
| | |-- Makefile
| | `-- mobilenetv1_full_api.cc
| `-- mobile_light
| |-- Makefile
| `-- mobilenetv1_light_api.cc
`-- opencl
`-- cl_kernel
|-- buffer
| |-- depthwise_conv2d_kernel.cl
| |-- elementwise_add_kernel.cl
| |-- fc_kernel.cl
| |-- im2col_kernel.cl
| |-- layout_kernel.cl
| |-- mat_mul_kernel.cl
| |-- pool_kernel.cl
| `-- relu_kernel.cl
|-- cl_common.h
`-- image
|-- channel_add_kernel.cl
|-- elementwise_add_kernel.cl
|-- pool_kernel.cl
`-- relu_kernel.cl
```
See the next section for examples of calling `libpaddle_api_full_bundled.a` and `libpaddle_api_light_bundled.a`.
## Running the examples
Using an android, ARMv8, gcc environment as an example, the three examples below show how to run OpenCL-based ARM GPU inference on a phone.
**Note:** all commands below are run from the root of the Lite source tree. Before any of the three examples, first run the following commands to prepare the environment:
```bash
# create the OpenCL directory tree under /data/local/tmp
adb shell mkdir -p /data/local/tmp/opencl
adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer
adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image
# push the OpenCL kernel files to /data/local/tmp/opencl
adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/
adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/
adb push lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/
```
### Example 1: demos from the build products
```bash
######################################################################
# Build the mobile_full demo
# Steps:
#   0. make sure Paddle-Lite was built with OpenCL;
#   1. edit `mobilenetv1_full_api.cc` and enable the DEMO_USE_OPENCL macro;
#   2. build the mobile_full demo in the product dir `demo/cxx/mobile_full`;
#   3. push the demo, the model, and the opencl kernel files to the phone;
#   4. run the demo and check the expected output.
######################################################################
adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api
adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/
adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
# use mobile_full to run mobilenet_v1
# `GLOG_v` is the log level
adb shell "export GLOG_v=0; \
/data/local/tmp/opencl/mobilenetv1_full_api \
--model_dir=/data/local/tmp/opencl/mobilenet_v1 \
--optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model"
######################################################################
# Build the mobile_light demo
# Steps:
#   0. make sure Paddle-Lite was built with OpenCL;
#   1. build model_optimize_tool and optimize the model with the
#      `targets` option set to `opencl`;
#   2. build the mobile_light demo in the product dir `demo/cxx/mobile_light`;
#   3. push the demo, the model, and the opencl kernel files to the phone;
#   4. run the demo and check the expected output.
######################################################################
# use model_optimize_tool to optimize model
./build.model_optimize_tool/lite/api/model_optimize_tool \
--model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
--optimize_out_type=naive_buffer \
--optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \
--valid_targets=opencl
adb shell mkdir /data/local/tmp/opencl/mobilenet_v1
chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api
adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/
adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1
# use mobile_light to run mobilenet_v1
adb shell "export GLOG_v=5; \
/data/local/tmp/opencl/mobilenetv1_light_api \
--model_dir=/data/local/tmp/opencl/"
```
### Example 2: the test_mobilenetv1 unit test
- **Prepare the files**
```bash
# push the mobilenet_v1 model files to /data/local/tmp/opencl
adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1
adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/
# push the OpenCL unit-test binary test_mobilenetv1 to /data/local/tmp/opencl
adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/local/tmp/opencl
```
- **Run OpenCL inference**
Run the OpenCL program with the following commands, where:
- `--cl_path` is the directory containing the OpenCL kernel files (cl\_kernel);
- `--model_dir` is the directory containing the model files.
```bash
adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1
adb shell /data/local/tmp/opencl/test_mobilenetv1 \
--cl_path=/data/local/tmp/opencl \
--model_dir=/data/local/tmp/opencl/mobilenet_v1 \
--warmup=1 \
--repeats=1
```
**Note:** weights are loaded the first time each op kernel runs, so the first run takes noticeably longer. Typically set warmup to 1 and repeats to a larger number.
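For example, a timing run might warm up once and then average over many repeats; the repeat count below is an arbitrary illustration:
```bash
# e.g., warm up once, then time 100 runs
adb shell /data/local/tmp/opencl/test_mobilenetv1 \
    --cl_path=/data/local/tmp/opencl \
    --model_dir=/data/local/tmp/opencl/mobilenet_v1 \
    --warmup=1 \
    --repeats=100
```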
### Example 3: the test_layout_opencl unit test
- **Prepare the files**
```bash
# push the OpenCL unit-test binary test_layout_opencl to /data/local/tmp/opencl
adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/
```
- **Run OpenCL inference**
```bash
adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl
adb shell /data/local/tmp/opencl/test_layout_opencl
```
# Using it in code
See the demo code from Example 1:
1. [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc);
2. [./lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc).
Note: these links point to the latest develop branch, which may differ from your local checkout; prefer the local code under `lite/demo/cxx/` as a usage reference.
**NOTE:** Support for OpenCL is still under active development.
......@@ -232,6 +232,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -251,6 +253,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
......@@ -296,10 +296,10 @@ if (LITE_ON_TINY_PUBLISH)
endif()
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling model_optimize_tool")
lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
DEPS gflags kernel op optimizer mir_passes utils)
add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
......
......@@ -181,6 +181,7 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env,
MobileConfig config;
// set model dir
// NOTE: This is a deprecated API and will be removed in latter release.
jmethodID model_dir_method = env->GetMethodID(
mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;");
jstring java_model_dir =
......@@ -190,6 +191,27 @@ inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env,
config.set_model_dir(cpp_model_dir);
}
// set model from file
jmethodID model_file_method = env->GetMethodID(
mobileconfig_jclazz, "getModelFromFile", "()Ljava/lang/String;");
jstring java_model_file =
(jstring)env->CallObjectMethod(jmobileconfig, model_file_method);
if (java_model_file != nullptr) {
std::string cpp_model_file = jstring_to_cpp_string(env, java_model_file);
config.set_model_from_file(cpp_model_file);
}
// set model from buffer
jmethodID model_buffer_method = env->GetMethodID(
mobileconfig_jclazz, "getModelFromBuffer", "()Ljava/lang/String;");
jstring java_model_buffer =
(jstring)env->CallObjectMethod(jmobileconfig, model_buffer_method);
if (java_model_buffer != nullptr) {
std::string cpp_model_buffer =
jstring_to_cpp_string(env, java_model_buffer);
config.set_model_from_buffer(cpp_model_buffer);
}
// set threads
jmethodID threads_method =
env->GetMethodID(mobileconfig_jclazz, "getThreads", "()I");
......
......@@ -64,6 +64,44 @@ public class MobileConfig extends ConfigBase {
return powerMode.value();
}
/**
* Set model from file.
*
* @return
*/
public void setModelFromFile(String modelFile) {
this.liteModelFile = modelFile;
}
/**
* Returns name of model_file.
*
* @return liteModelFile
*/
public String getModelFile() {
return liteModelFile;
}
/**
* Set model from buffer.
*
* @return
*/
public void setModelFromBuffer(String modelBuffer) {
this.liteModelBuffer = modelBuffer;
}
/**
* Returns model buffer
*
* @return liteModelBuffer
*/
public String getModelBuffer() {
return liteModelBuffer;
}
private PowerMode powerMode = PowerMode.LITE_POWER_HIGH;
private int threads = 1;
private String liteModelFile;
private String liteModelBuffer;
}
......@@ -62,7 +62,7 @@ TEST(CXXApi_LightApi, optim_model) {
TEST(CXXApi_LightApi, save_and_load_model) {
lite::Predictor cxx_api;
lite::LightPredictor light_api(FLAGS_optimized_model);
lite::LightPredictor light_api(FLAGS_optimized_model + ".nb", false);
// CXXAPi
{
......
......@@ -116,7 +116,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
lite_api::MobileConfig config;
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
auto predictor = lite_api::CreatePaddlePredictor(config);
......
......@@ -43,6 +43,7 @@ void Predictor::SaveModel(const std::string &dir,
LOG(FATAL) << "Unknown model type";
}
if (record_info) {
MkDirRecur(dir);
SaveOpKernelInfo(dir);
}
}
......@@ -121,6 +122,7 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
<< kpf_path;
}
#ifndef LITE_WITH_FPGA
lite::Tensor *Predictor::GetInput(size_t offset) {
CHECK(input_names_.size() > offset)
<< "The network has " << input_names_.size() << " inputs"
......@@ -130,6 +132,17 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
<< " in exec_scope";
return in_var->GetMutable<lite::Tensor>();
}
#else
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = exec_scope_->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
#endif
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
......@@ -167,6 +180,8 @@ void Predictor::PrepareFeedFetch() {
}
}
#ifndef LITE_WITH_FPGA
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
CHECK(output_names_.size() > offset)
<< "The network has " << output_names_.size() << " outputs"
......@@ -186,6 +201,29 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
}
return outputs;
}
#else
const lite::Tensor *Predictor::GetOutput(size_t offset) const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
std::vector<const lite::Tensor *> outputs;
for (const auto& out : fetch_list) {  // take a reference, not a copy, so the stored pointers stay valid
outputs.push_back(&out);
}
return outputs;
}
#endif
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
......@@ -239,7 +277,7 @@ void Predictor::Build(const std::string &model_path,
case lite_api::LiteModelType::kNaiveBuffer:
CHECK(!model_path.empty())
<< "NaiveBuffer backend only supported combined param";
LoadModelNaive(model_path, scope_.get(), &program_desc_);
LoadModelNaiveFromFile(model_path, scope_.get(), &program_desc_);
break;
default:
LOG(FATAL) << "Unknown model type";
......
......@@ -101,7 +101,7 @@ TEST(CXXApi, save_model) {
TEST(CXXApi, load_model_naive) {
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_optimized_model + ".naive",
predictor.Build(FLAGS_optimized_model + ".naive.nb",
"",
"",
valid_places,
......
......@@ -18,6 +18,17 @@
namespace paddle {
namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
}
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch();
}
void LightPredictor::Build(const std::string& model_dir,
const std::string& model_buffer,
const std::string& param_buffer,
......
......@@ -18,6 +18,7 @@
*/
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
......@@ -39,12 +40,22 @@ namespace lite {
*/
class LITE_API LightPredictor {
public:
LightPredictor(
const std::string& model_dir,
const std::string& model_buffer = "",
const std::string& param_buffer = "",
bool model_from_memory = false,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf) {
// constructor function of LightPredictor, `lite_model_file` refers to data in
// the model file or buffer, `model_from_memory` refers to whether to load the
// model from memory.
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
Build(lite_model_file, model_from_memory);
}
// NOTE: This is a deprecated API and will be removed in latter release.
LightPredictor(const std::string& model_dir,
const std::string& model_buffer = "",
const std::string& param_buffer = "",
bool model_from_memory = false,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -69,6 +80,10 @@ class LITE_API LightPredictor {
void PrepareFeedFetch();
private:
void Build(const std::string& lite_model_file,
bool model_from_memory = false);
// NOTE: This is a deprecated API and will be removed in latter release.
void Build(
const std::string& model_dir,
const std::string& model_buffer,
......
......@@ -23,13 +23,17 @@ namespace lite {
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor Only support NaiveBuffer backend in publish lib
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
if (config.lite_model_file().empty()) {
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
} else {
raw_predictor_.reset(new LightPredictor(config.lite_model_file(),
config.model_from_memory()));
}
mode_ = config.power_mode();
threads_ = config.threads();
}
......
......@@ -23,6 +23,10 @@
#include "lite/core/op_registry.h"
DEFINE_string(optimized_model, "", "optimized_model");
DEFINE_int32(N, 1, "input_batch");
DEFINE_int32(C, 3, "input_channel");
DEFINE_int32(H, 224, "input_height");
DEFINE_int32(W, 224, "input_width");
namespace paddle {
namespace lite {
......@@ -37,7 +41,8 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Build(model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
......@@ -58,6 +63,8 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.SaveModel(FLAGS_optimized_model);
}
LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " "
<< FLAGS_H << " " << FLAGS_W;
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
......@@ -123,10 +130,10 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
......
......@@ -23,6 +23,10 @@
#include "lite/core/op_registry.h"
DEFINE_string(optimized_model, "", "optimized_model");
DEFINE_int32(N, 1, "input_batch");
DEFINE_int32(C, 3, "input_channel");
DEFINE_int32(H, 224, "input_height");
DEFINE_int32(W, 224, "input_width");
namespace paddle {
namespace lite {
......@@ -38,7 +42,8 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.Build(model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
......@@ -59,6 +64,8 @@ void TestModel(const std::vector<Place>& valid_places,
predictor.SaveModel(FLAGS_optimized_model);
}
LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " "
<< FLAGS_H << " " << FLAGS_W;
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
......@@ -123,8 +130,11 @@ TEST(MobileNetV2, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV2, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
TestModel(valid_places);
......
......@@ -73,7 +73,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
const int warmup_times = 0) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......
......@@ -17,7 +17,7 @@
#include <gtest/gtest.h>
#endif
// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h"
// are created automatically during model_optimize_tool's compiling period
// are created automatically during opt's compiling period
#include <iomanip>
#include "all_kernel_faked.cc" // NOLINT
#include "kernel_src_map.h" // NOLINT
......@@ -26,9 +26,11 @@
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/op_registry.h"
#include "lite/core/version.h"
#include "lite/model_parser/compatible_pb.h"
#include "lite/model_parser/pb/program_desc.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
#include "lite/utils/string.h"
#include "supported_kernel_op_info.h" // NOLINT
......@@ -89,13 +91,13 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
......@@ -239,6 +241,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
/// Print help information
void PrintHelpInfo() {
// at least one argument should be given
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
......@@ -260,7 +263,8 @@ void PrintHelpInfo() {
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
" Display operators in the input model\n";
std::cout << help_info << std::endl;
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
exit(1);
}
......@@ -397,6 +401,7 @@ void Main() {
return;
}
lite::MkDirRecur(FLAGS_optimize_out);
auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model";
......@@ -451,7 +456,9 @@ int main(int argc, char** argv) {
}
google::ParseCommandLineFlags(&argc, &argv, false);
paddle::lite_api::ParseInputCommand();
paddle::lite_api::CheckIfModelSupported();
if (FLAGS_model_set_dir == "") {
paddle::lite_api::CheckIfModelSupported();
}
paddle::lite_api::Main();
return 0;
}
......@@ -190,5 +190,27 @@ void ConfigBase::set_threads(int threads) {
#endif
}
// set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory
// buffer
void MobileConfig::set_model_from_file(const std::string &x) {
lite_model_file_ = x;
}
void MobileConfig::set_model_from_buffer(const std::string &x) {
lite_model_file_ = x;
model_from_memory_ = true;
}
void MobileConfig::set_model_buffer(const char *model_buffer,
size_t model_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
LOG(WARNING) << "warning: `set_model_buffer` will be abandoned in "
"release/v3.0.0, new method `set_model_from_buffer(const "
"std::string &x)` is recommended.";
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
} // namespace lite_api
} // namespace paddle
......@@ -168,22 +168,40 @@ class LITE_API CxxConfig : public ConfigBase {
/// MobileConfig is the config for the light weight predictor, it will skip
/// IR optimization or other unnecessary stages.
class LITE_API MobileConfig : public ConfigBase {
// whether to load data from memory. Model data will be loaded from memory
// buffer if model_from_memory_ is true.
bool model_from_memory_{false};
// model data readed from file or memory buffer in combined format.
std::string lite_model_file_;
// NOTE: This is a deprecated variable and will be removed in latter release.
std::string model_buffer_;
std::string param_buffer_;
bool model_from_memory_{false};
public:
// set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory
// buffer
void set_model_from_file(const std::string& x);
void set_model_from_buffer(const std::string& x);
// return model data in lite_model_file_, which is in combined format.
const std::string& lite_model_file() const { return lite_model_file_; }
// return model_from_memory_, which indicates whether to load model from
// memory buffer.
bool model_from_memory() const { return model_from_memory_; }
// NOTE: This is a deprecated API and will be removed in latter release.
void set_model_buffer(const char* model_buffer,
size_t model_buffer_size,
const char* param_buffer,
size_t param_buffer_size) {
model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size);
param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
size_t param_buffer_size);
bool model_from_memory() const { return model_from_memory_; }
// NOTE: This is a deprecated API and will be removed in latter release.
const std::string& model_buffer() const { return model_buffer_; }
// NOTE: This is a deprecated API and will be removed in latter release.
const std::string& param_buffer() const { return param_buffer_; }
};
......
......@@ -72,7 +72,7 @@ TEST(CxxApi, run) {
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(LightApi, run) {
lite_api::MobileConfig config;
config.set_model_dir(FLAGS_model_dir + ".opt2.naive");
config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -109,16 +109,11 @@ TEST(LightApi, run) {
// Demo2 for Loading model from memory
TEST(MobileConfig, LoadfromMemory) {
// Get naive buffer
auto model_path = std::string(FLAGS_model_dir) + ".opt2.naive/__model__.nb";
auto params_path = std::string(FLAGS_model_dir) + ".opt2.naive/param.nb";
std::string model_buffer = lite::ReadFile(model_path);
size_t size_model = model_buffer.length();
std::string params_buffer = lite::ReadFile(params_path);
size_t size_params = params_buffer.length();
auto model_file = std::string(FLAGS_model_dir) + ".opt2.naive.nb";
std::string model_buffer = lite::ReadFile(model_file);
// set model buffer and run model
lite_api::MobileConfig config;
config.set_model_buffer(
model_buffer.c_str(), size_model, params_buffer.c_str(), size_params);
config.set_model_from_buffer(model_buffer);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
......
......@@ -116,6 +116,8 @@ void BindLiteMobileConfig(py::module *m) {
py::class_<MobileConfig> mobile_config(*m, "MobileConfig");
mobile_config.def(py::init<>())
.def("set_model_from_file", &MobileConfig::set_model_from_file)
.def("set_model_from_buffer", &MobileConfig::set_model_from_buffer)
.def("set_model_dir", &MobileConfig::set_model_dir)
.def("model_dir", &MobileConfig::model_dir)
.def("set_model_buffer", &MobileConfig::set_model_buffer)
......
......@@ -31,11 +31,7 @@ TEST(ResNet50, test) {
std::vector<Place> valid_places(
{Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}});
predictor.Build(FLAGS_model_dir,
"",
"",
Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
valid_places);
predictor.Build(FLAGS_model_dir, "", "", valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
......
......@@ -78,6 +78,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
conv3x3s2_depthwise_int8.cc
conv5x5s1_depthwise_int8.cc
conv5x5s1_depthwise_fp32.cc
conv5x5s2_depthwise_int8.cc
conv5x5s2_depthwise_fp32.cc
conv3x3_winograd_fp32_c4.cc
conv_winograd_3x3.cc
......
......@@ -122,8 +122,7 @@ void conv_compute_6x6_3x3(const float* input,
// begin compute
for (int ni = 0; ni < num; ++ni) {
// trans input to c4
#pragma omp parallel for num_threads(threads)
// trans input to c4
for (int i = 0; i < ic_4; ++i) {
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
......@@ -410,8 +409,7 @@ void conv_compute_2x2_3x3(const float* input,
// begin compute
for (int ni = 0; ni < num; ++ni) {
// trans input to c4
#pragma omp parallel for num_threads(threads)
// trans input to c4
for (int i = 0; i < ic_4; ++i) {
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
......@@ -672,9 +670,8 @@ void conv_compute_2x2_3x3_small(const float* input,
// begin compute
for (int ni = 0; ni < num; ++ni) {
// trans input to c4
// trans input to c4
#pragma omp parallel for num_threads(threads)
for (int i = 0; i < ic_4; ++i) {
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
......
......@@ -109,7 +109,7 @@ void conv_depthwise_5x5s1_fp32(float* dout,
tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size);
float* pre_out = pre_din + pre_in_size;
#else
float pre_din = tmp_din;
float* pre_din = tmp_din;
float* pre_out = pre_din + pre_in_size;
#endif
prepack_input_nxwc4_dw(
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_depthwise.h"
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
namespace arm {
namespace math {
#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b))
template <typename Dtype>
void conv_depthwise_5x5s2_int8(Dtype* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx) {
const int threads = ctx->threads();
int llc_size = ctx->llc_size() / 4;
const int hout_c_block = 8;
const int hout_r_kernel = 1;
const int wout_block = 4;
const int wout_round = ((wout + wout_block - 1) / wout_block) * wout_block;
const int win_round = wout_round * 2 + 3;
//! get h block
//! llc_size = threads * win_round * hout_c_block * hin_r_block *
//! sizeof(int8_t)
//! + wout_round * hout_c_block * hout_r_block * threads * sizeof(int32_t)
//! win_round = wout_round * 2 + 3
//! hin_r_block = hout_r_block * 2 + 3
int hout_r_block = (llc_size - 3 * win_round * hout_c_block * threads) /
(2 * win_round * hout_c_block * threads +
hout_c_block * wout_round * threads * 4);
hout_r_block = hout_r_block > hout ? hout : hout_r_block;
hout_r_block =
((hout_r_block + hout_r_kernel - 1) / hout_r_kernel) * hout_r_kernel;
hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block;
const int hin_r_block = hout_r_block * 2 + 3;
auto tmp_work_space = ctx->workspace_data<int8_t>();
int8_t ptr_zero[win_round]; // NOLINT
memset(ptr_zero, 0, sizeof(int8_t) * win_round);
Dtype ptr_write[wout_round]; // NOLINT
int in_len = win_round * hout_c_block;
int pre_in_size = hin_r_block * in_len;
pre_in_size = ROUNDUP(pre_in_size, 4);
int pre_out_size = hout_c_block * hout_r_block * wout_round;
int8_t* tmp_din = tmp_work_space;
int size_in_channel = win * hin;
int size_out_channel = wout * hout;
int w_stride = 25; // kernel_w * kernel_h;
int ws = -padw;
int we = ws + win_round;
int w_loop = wout_round / 4;
int chout = chin;
int out_row_stride = hout_c_block * wout_round;
for (int n = 0; n < num; ++n) {
const int8_t* din_batch = din + n * chin * size_in_channel;
int8_t* dout_batch = reinterpret_cast<int8_t*>(dout) +
n * chout * size_out_channel * sizeof(Dtype);
for (int h = 0; h < hout; h += hout_r_block) {
int h_kernel = hout_r_block;
if (h + hout_r_block > hout) {
h_kernel = hout - h;
}
int hs = h - padh;
int he = hs + h_kernel * 2 + 3;
#pragma omp parallel for num_threads(threads)
for (int c = 0; c < chout; c += hout_c_block) {
#ifdef ARM_WITH_OMP
int8_t* pre_din =
tmp_din + omp_get_thread_num() * (pre_in_size + pre_out_size * 4);
int32_t* pre_out = reinterpret_cast<int*>(pre_din + pre_in_size);
#else
int32_t* pre_out = reinterpret_cast<int32_t*>(tmp_din + pre_in_size);
auto pre_din = tmp_din;
#endif
prepack_input_nxwc8_int8_dw(
din_batch, pre_din, c, hs, he, ws, we, chin, win, hin);
const int8_t* block_inr0 = pre_din;
const int8_t* block_inr1 = block_inr0 + in_len;
const int8_t* block_inr2 = block_inr1 + in_len;
const int8_t* block_inr3 = block_inr2 + in_len;
const int8_t* block_inr4 = block_inr3 + in_len;
const int8_t* weight_c = weights + c * w_stride;
float bias_local[8] = {0, 0, 0, 0, 0, 0, 0, 0};
if (flag_bias) {
bias_local[0] = bias[c];
bias_local[1] = bias[c + 1];
bias_local[2] = bias[c + 2];
bias_local[3] = bias[c + 3];
bias_local[4] = bias[c + 4];
bias_local[5] = bias[c + 5];
bias_local[6] = bias[c + 6];
bias_local[7] = bias[c + 7];
}
for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) {
int cnt = w_loop;
const int8_t* inr0 = block_inr0;
const int8_t* inr1 = block_inr1;
const int8_t* inr2 = block_inr2;
const int8_t* inr3 = block_inr3;
const int8_t* inr4 = block_inr4;
int32_t* ptr_out0 = pre_out + hk * out_row_stride;
// clang-format off
#ifdef __aarch64__
auto wptr = weight_c;
asm volatile(
"ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r0]], #32\n" /* load r0 0-3 */
"ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r0]], #32\n" /* load r0 4-7 */
"ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 0-3 */
"1:\n"
/* in r0 */
"smull v20.8h, v0.8b, v12.8b\n" /* w0, int16, out0 */
"smull v21.8h, v2.8b, v12.8b\n" /* w0, int16, out1 */
"smull v22.8h, v4.8b, v12.8b\n" /* w0, int16, out2 */
"smull v23.8h, v6.8b, v12.8b\n" /* w0, int16, out3 */
"ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r0]]\n" /* load r0 8-11 */
"smlal v20.8h, v1.8b, v13.8b\n" /* w1, int16, out0 */
"smlal v21.8h, v3.8b, v13.8b\n" /* w1, int16, out1 */
"smlal v22.8h, v5.8b, v13.8b\n" /* w1, int16, out2 */
"smlal v23.8h, v7.8b, v13.8b\n" /* w1, int16, out3 */
"sxtl v24.4s, v20.4h\n" /* mov to out0 low */
"sxtl2 v25.4s, v20.8h\n" /* mov to out0 hig */
"sxtl v26.4s, v21.4h\n" /* mov to out1 low */
"sxtl2 v27.4s, v21.8h\n" /* mov to out1 hig */
"sxtl v28.4s, v22.4h\n" /* mov to out2 low */
"sxtl2 v29.4s, v22.8h\n" /* mov to out2 hig */
"sxtl v30.4s, v23.4h\n" /* mov to out3 low */
"sxtl2 v31.4s, v23.8h\n" /* mov to out3 hig */
"ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[wc]], #32\n" /* load wc 4-7 */
"smull v20.8h, v2.8b, v14.8b\n" /* w2, int16, out0 */
"smull v21.8h, v4.8b, v14.8b\n" /* w2, int16, out1 */
"smull v22.8h, v6.8b, v14.8b\n" /* w2, int16, out2 */
"smull v23.8h, v8.8b, v14.8b\n" /* w2, int16, out3 */
"smlal v20.8h, v3.8b, v15.8b\n" /* w3, int16, out0 */
"smlal v21.8h, v5.8b, v15.8b\n" /* w3, int16, out1 */
"smlal v22.8h, v7.8b, v15.8b\n" /* w3, int16, out2 */
"smlal v23.8h, v9.8b, v15.8b\n" /* w3, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r1]], #32\n" /* load r1 0-3 */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v4.8b, v16.8b\n" /* w4, int16, out0 */
"smull v21.8h, v6.8b, v16.8b\n" /* w4, int16, out1 */
"smull v22.8h, v8.8b, v16.8b\n" /* w4, int16, out2 */
"smull v23.8h, v10.8b, v16.8b\n" /* w4, int16, out3 */
"ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r1]], #32\n" /* load r1 4-7 */
/* in r1 */
"smlal v20.8h, v0.8b, v17.8b\n" /* w5, int16, out0 */
"smlal v21.8h, v2.8b, v17.8b\n" /* w5, int16, out1 */
"smlal v22.8h, v4.8b, v17.8b\n" /* w5, int16, out2 */
"smlal v23.8h, v6.8b, v17.8b\n" /* w5, int16, out3 */
"ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r1]]\n" /* load r1 8-11 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v1.8b, v18.8b\n" /* w6, int16, out0 */
"smull v21.8h, v3.8b, v18.8b\n" /* w6, int16, out1 */
"smull v22.8h, v5.8b, v18.8b\n" /* w6, int16, out2 */
"smull v23.8h, v7.8b, v18.8b\n" /* w6, int16, out3 */
"ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 8-11 */
"smlal v20.8h, v2.8b, v19.8b\n" /* w7, int16, out0 */
"smlal v21.8h, v4.8b, v19.8b\n" /* w7, int16, out1 */
"smlal v22.8h, v6.8b, v19.8b\n" /* w7, int16, out2 */
"smlal v23.8h, v8.8b, v19.8b\n" /* w7, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[wc]], #32\n" /* load wc 12-15 */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v3.8b, v12.8b\n" /* w8, int16, out0 */
"smull v21.8h, v5.8b, v12.8b\n" /* w8, int16, out1 */
"smull v22.8h, v7.8b, v12.8b\n" /* w8, int16, out2 */
"smull v23.8h, v9.8b, v12.8b\n" /* w8, int16, out3 */
"ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r2]], #32\n" /* load r2 0-3 */
"smlal v20.8h, v4.8b, v13.8b\n" /* w9, int16, out0 */
"smlal v21.8h, v6.8b, v13.8b\n" /* w9, int16, out1 */
"smlal v22.8h, v8.8b, v13.8b\n" /* w9, int16, out2 */
"smlal v23.8h, v10.8b, v13.8b\n" /* w9, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r2]], #32\n" /* load r2 4-7 */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
/* in r2 */
"smull v20.8h, v0.8b, v14.8b\n" /* w10, int16, out0 */
"smull v21.8h, v2.8b, v14.8b\n" /* w10, int16, out1 */
"smull v22.8h, v4.8b, v14.8b\n" /* w10, int16, out2 */
"smull v23.8h, v6.8b, v14.8b\n" /* w10, int16, out3 */
"ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r2]]\n" /* load r2 8-11 */
"smlal v20.8h, v1.8b, v15.8b\n" /* w11, int16, out0 */
"smlal v21.8h, v3.8b, v15.8b\n" /* w11, int16, out1 */
"smlal v22.8h, v5.8b, v15.8b\n" /* w11, int16, out2 */
"smlal v23.8h, v7.8b, v15.8b\n" /* w11, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 16-19 */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v2.8b, v16.8b\n" /* w12, int16, out0 */
"smull v21.8h, v4.8b, v16.8b\n" /* w12, int16, out1 */
"smull v22.8h, v6.8b, v16.8b\n" /* w12, int16, out2 */
"smull v23.8h, v8.8b, v16.8b\n" /* w12, int16, out3 */
"smlal v20.8h, v3.8b, v17.8b\n" /* w13, int16, out0 */
"smlal v21.8h, v5.8b, v17.8b\n" /* w13, int16, out1 */
"smlal v22.8h, v7.8b, v17.8b\n" /* w13, int16, out2 */
"smlal v23.8h, v9.8b, v17.8b\n" /* w13, int16, out3 */
"ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r3]], #32\n" /* load r3 0-3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v4.8b, v18.8b\n" /* w14, int16, out0 */
"smull v21.8h, v6.8b, v18.8b\n" /* w14, int16, out1 */
"ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r3]], #32\n" /* load r3 4-7 */
"smull v22.8h, v8.8b, v18.8b\n" /* w14, int16, out2 */
"smull v23.8h, v10.8b, v18.8b\n" /* w14, int16, out3 */
/* in r3 */
"smlal v20.8h, v0.8b, v19.8b\n" /* w15, int16, out0 */
"smlal v21.8h, v2.8b, v19.8b\n" /* w15, int16, out1 */
"smlal v22.8h, v4.8b, v19.8b\n" /* w15, int16, out2 */
"smlal v23.8h, v6.8b, v19.8b\n" /* w15, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r3]]\n" /* load r3 8-11 */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v1.8b, v12.8b\n" /* w16, int16, out0 */
"smull v21.8h, v3.8b, v12.8b\n" /* w16, int16, out1 */
"smull v22.8h, v5.8b, v12.8b\n" /* w16, int16, out2 */
"smull v23.8h, v7.8b, v12.8b\n" /* w16, int16, out3 */
"ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[wc]], #32\n" /* load wc 20-23 */
"smlal v20.8h, v2.8b, v13.8b\n" /* w17, int16, out0 */
"smlal v21.8h, v4.8b, v13.8b\n" /* w17, int16, out1 */
"smlal v22.8h, v6.8b, v13.8b\n" /* w17, int16, out2 */
"smlal v23.8h, v8.8b, v13.8b\n" /* w17, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v3.8b, v14.8b\n" /* w18, int16, out0 */
"smull v21.8h, v5.8b, v14.8b\n" /* w18, int16, out1 */
"smull v22.8h, v7.8b, v14.8b\n" /* w18, int16, out2 */
"smull v23.8h, v9.8b, v14.8b\n" /* w18, int16, out3 */
"ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r4]], #32\n" /* load r4 0-3 */
"smlal v20.8h, v4.8b, v15.8b\n" /* w19, int16, out0 */
"smlal v21.8h, v6.8b, v15.8b\n" /* w19, int16, out1 */
"smlal v22.8h, v8.8b, v15.8b\n" /* w19, int16, out2 */
"smlal v23.8h, v10.8b, v15.8b\n" /* w19, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r4]], #32\n" /* load r4 4-7 */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
/* in r4 */
"smull v20.8h, v0.8b, v16.8b\n" /* w20, int16, out0 */
"smull v21.8h, v2.8b, v16.8b\n" /* w20, int16, out1 */
"smull v22.8h, v4.8b, v16.8b\n" /* w20, int16, out2 */
"smull v23.8h, v6.8b, v16.8b\n" /* w20, int16, out3 */
"ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[r4]]\n" /* load r4 8-11 */
"smlal v20.8h, v1.8b, v17.8b\n" /* w21, int16, out0 */
"smlal v21.8h, v3.8b, v17.8b\n" /* w21, int16, out1 */
"smlal v22.8h, v5.8b, v17.8b\n" /* w21, int16, out2 */
"smlal v23.8h, v7.8b, v17.8b\n" /* w21, int16, out3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"ld1 {v16.8b}, [%[wc]], #8\n" /* load wc 24 */
"smull v20.8h, v2.8b, v18.8b\n" /* w22, int16, out0 */
"smull v21.8h, v4.8b, v18.8b\n" /* w22, int16, out1 */
"smull v22.8h, v6.8b, v18.8b\n" /* w22, int16, out2 */
"smull v23.8h, v8.8b, v18.8b\n" /* w22, int16, out3 */
"sub %[wc], %[wc], #200 \n"
"smlal v20.8h, v3.8b, v19.8b\n" /* w23, int16, out0 */
"smlal v21.8h, v5.8b, v19.8b\n" /* w23, int16, out1 */
"smlal v22.8h, v7.8b, v19.8b\n" /* w23, int16, out2 */
"smlal v23.8h, v9.8b, v19.8b\n" /* w23, int16, out3 */
"ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[r0]], #32\n" /* load r0 0-3 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[wc]], #32\n" /* load wc 0-3 */
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"smull v20.8h, v4.8b, v16.8b\n" /* w24, int16, out0 */
"smull v21.8h, v6.8b, v16.8b\n" /* w24, int16, out1 */
"smull v22.8h, v8.8b, v16.8b\n" /* w24, int16, out2 */
"smull v23.8h, v10.8b, v16.8b\n" /* w24, int16, out3 */
"ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[r0]], #32\n" /* load r0 4-7 */
"saddw v24.4s, v24.4s, v20.4h\n" /* add to out0 low */
"saddw2 v25.4s, v25.4s, v20.8h\n" /* add to out0 hig */
"saddw v26.4s, v26.4s, v21.4h\n" /* add to out1 low */
"saddw2 v27.4s, v27.4s, v21.8h\n" /* add to out1 hig */
"stp q24, q25, [%[ptr_out0]], #32\n"
"saddw v28.4s, v28.4s, v22.4h\n" /* add to out2 low */
"saddw2 v29.4s, v29.4s, v22.8h\n" /* add to out2 hig */
"stp q26, q27, [%[ptr_out0]], #32\n"
"saddw v30.4s, v30.4s, v23.4h\n" /* add to out3 low */
"saddw2 v31.4s, v31.4s, v23.8h\n" /* add to out3 hig */
"subs %w[cnt], %w[cnt], #1\n"
"stp q28, q29, [%[ptr_out0]], #32\n"
"stp q30, q31, [%[ptr_out0]], #32\n"
"bne 1b\n"
: [cnt] "+r"(cnt),
[r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[r3] "+r"(inr3),
[r4] "+r"(inr4),
[wc] "+r"(wptr),
[ptr_out0] "+r"(ptr_out0)
:
: "cc","memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8","v9","v10","v11","v12","v13",
"v14","v15","v16","v17","v18","v19",
"v20","v21","v22","v23","v24","v25",
"v26","v27","v28","v29","v30","v31"
);
#else
auto wptr = weight_c;
asm volatile(
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load r0, 0-3 */
"vld1.32 {d4-d5}, [%[r0]]!\n" /* load r0, 4-5 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w0-w1 */
"1:\n"
/* inr0 */
"vmull.s8 q4, d0, d6\n" /* int16, out0 */
"vmull.s8 q5, d2, d6\n" /* int16, out1 */
"vmull.s8 q6, d4, d6\n" /* int16, out2 */
"vmlal.s8 q4, d1, d7\n" /* int16, out0 */
"vld1.32 {d0-d1}, [%[r0]]!\n" /* load r0, 6-7 */
"vmlal.s8 q5, d3, d7\n" /* int16, out1 */
"vmlal.s8 q6, d5, d7\n" /* int16, out2 */
"vmovl.s16 q8, d8\n" /* mov to out0 low */
"vmull.s8 q7, d0, d6\n" /* int16, out3 */
"vmovl.s16 q9, d9\n" /* mov to out0 hig */
"vmovl.s16 q10, d10\n" /* mov to out1 low */
"vmovl.s16 q11, d11\n" /* mov to out1 hig */
"vmlal.s8 q7, d1, d7\n" /* int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w2-w3 */
"vmovl.s16 q12, d12\n" /* mov to out2 low */
"vmovl.s16 q13, d13\n" /* mov to out2 hig */
"vmovl.s16 q14, d14\n" /* mov to out3 low */
"vmovl.s16 q15, d15\n" /* mov to out3 hig */
"vmull.s8 q4, d2, d6\n" /* w2, int16, out0 */
"vmull.s8 q5, d4, d6\n" /* w2, int16, out1 */
"vmull.s8 q6, d0, d6\n" /* w2, int16, out2 */
"vmlal.s8 q4, d3, d7\n" /* w3, int16, out0 */
"vld1.32 {d2-d3}, [%[r0]]!\n" /* load r0, 8-9 */
"vmlal.s8 q5, d5, d7\n" /* w3, int16, out1 */
"vmlal.s8 q6, d1, d7\n" /* w3, int16, out2 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vmull.s8 q7, d2, d6\n" /* w2, int16, out3 */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d3, d7\n" /* w3, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w4-w5 */
"vld1.32 {d5}, [%[r0]]\n" /* load r0, 10 */
"sub %[r0], %[r0], #16\n" /* r0 = r0 - 16 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d4, d6\n" /* w4, int16, out0 */
"vmull.s8 q5, d0, d6\n" /* w4, int16, out1 */
"vmull.s8 q6, d2, d6\n" /* w4, int16, out2 */
"vmull.s8 q7, d5, d6\n" /* w4, int16, out3 */
"vld1.32 {d0-d3}, [%[r1]]!\n" /* load r1, 0-3 */
"vld1.32 {d4-d5}, [%[r1]]!\n" /* load r1, 4-5 */
/* inr1 */
"vmlal.s8 q4, d0, d7\n" /* w5, int16, out0 */
"vmlal.s8 q5, d2, d7\n" /* w5, int16, out1 */
"vmlal.s8 q6, d4, d7\n" /* w5, int16, out2 */
"vld1.32 {d0}, [%[r1]]!\n" /* load r1, 6 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d0, d7\n" /* w5, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w6-w7 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d1, d6\n" /* w6, int16, out0 */
"vld1.32 {d1}, [%[r1]]!\n" /* load r1, 7 */
"vmull.s8 q5, d3, d6\n" /* w6, int16, out1 */
"vmull.s8 q6, d5, d6\n" /* w6, int16, out2 */
"vmlal.s8 q4, d2, d7\n" /* w7, int16, out0 */
"vmlal.s8 q5, d4, d7\n" /* w7, int16, out1 */
"vmlal.s8 q6, d0, d7\n" /* w7, int16, out2 */
"vmull.s8 q7, d1, d6\n" /* w6, int16, out3 */
"vld1.32 {d2}, [%[r1]]!\n" /* load r1, 8 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d2, d7\n" /* w7, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w8-w9 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d3, d6\n" /* w8, int16, out0 */
"vld1.32 {d3}, [%[r1]]!\n" /* load r1, 9 */
"vmull.s8 q5, d5, d6\n" /* w8, int16, out1 */
"vmull.s8 q6, d1, d6\n" /* w8, int16, out2 */
"vld1.32 {d5}, [%[r1]]\n" /* load r1, 10 */
"vmlal.s8 q4, d4, d7\n" /* w9, int16, out0 */
"vmlal.s8 q5, d0, d7\n" /* w9, int16, out1 */
"vmlal.s8 q6, d2, d7\n" /* w9, int16, out2 */
"vmull.s8 q7, d3, d6\n" /* w8, int16, out3 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d5, d7\n" /* w9, int16, out3 */
"sub %[r1], %[r1], #16\n" /* r1 = r1 - 16 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w10-w11 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vld1.32 {d0-d3}, [%[r2]]!\n" /* load r2, 0-3 */
"vld1.32 {d4-d5}, [%[r2]]!\n" /* load r2, 4-5 */
/* inr2 */
"vmull.s8 q4, d0, d6\n" /* w10, int16, out0 */
"vmull.s8 q5, d2, d6\n" /* w10, int16, out1 */
"vmull.s8 q6, d4, d6\n" /* w10, int16, out2 */
"vmlal.s8 q4, d1, d7\n" /* w11, int16, out0 */
"vld1.32 {d0-d1}, [%[r2]]!\n" /* load r2, 6-7 */
"vmlal.s8 q5, d3, d7\n" /* w11, int16, out1 */
"vmlal.s8 q6, d5, d7\n" /* w11, int16, out2 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vmull.s8 q7, d0, d6\n" /* w10, int16, out3 */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d1, d7\n" /* w11, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w12-w13 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d2, d6\n" /* w12, int16, out0 */
"vmull.s8 q5, d4, d6\n" /* w12, int16, out1 */
"vmull.s8 q6, d0, d6\n" /* w12, int16, out2 */
"vmlal.s8 q4, d3, d7\n" /* w13, int16, out0 */
"vld1.32 {d2-d3}, [%[r2]]!\n" /* load r2, 8-9 */
"vmlal.s8 q5, d5, d7\n" /* w13, int16, out1 */
"vmlal.s8 q6, d1, d7\n" /* w13, int16, out2 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vmull.s8 q7, d2, d6\n" /* w12, int16, out3 */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d3, d7\n" /* w13, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w14-w15 */
"vld1.32 {d5}, [%[r2]]\n" /* load r2, 10 */
"sub %[r2], %[r2], #16\n" /* r2 = r2 - 16 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d4, d6\n" /* w14, int16, out0 */
"vmull.s8 q5, d0, d6\n" /* w14, int16, out1 */
"vmull.s8 q6, d2, d6\n" /* w14, int16, out2 */
"vmull.s8 q7, d5, d6\n" /* w14, int16, out3 */
"vld1.32 {d0-d3}, [%[r3]]!\n" /* load r3, 0-3 */
"vld1.32 {d4-d5}, [%[r3]]!\n" /* load r3, 4-5 */
/* inr3 */
"vmlal.s8 q4, d0, d7\n" /* w15, int16, out0 */
"vmlal.s8 q5, d2, d7\n" /* w15, int16, out1 */
"vmlal.s8 q6, d4, d7\n" /* w15, int16, out2 */
"vld1.32 {d0}, [%[r3]]!\n" /* load r3, 6 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d0, d7\n" /* w15, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w16-w17 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d1, d6\n" /* w16, int16, out0 */
"vld1.32 {d1}, [%[r3]]!\n" /* load r3, 7 */
"vmull.s8 q5, d3, d6\n" /* w16, int16, out1 */
"vmull.s8 q6, d5, d6\n" /* w16, int16, out2 */
"vmlal.s8 q4, d2, d7\n" /* w17, int16, out0 */
"vmlal.s8 q5, d4, d7\n" /* w17, int16, out1 */
"vmlal.s8 q6, d0, d7\n" /* w17, int16, out2 */
"vmull.s8 q7, d1, d6\n" /* w16, int16, out3 */
"vld1.32 {d2}, [%[r3]]!\n" /* load r3, 8 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d2, d7\n" /* w17, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w18-w19 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d3, d6\n" /* w18, int16, out0 */
"vld1.32 {d3}, [%[r3]]!\n" /* load r3, 9 */
"vmull.s8 q5, d5, d6\n" /* w18, int16, out1 */
"vmull.s8 q6, d1, d6\n" /* w18, int16, out2 */
"vld1.32 {d5}, [%[r3]]\n" /* load r3, 10 */
"vmlal.s8 q4, d4, d7\n" /* w19, int16, out0 */
"vmlal.s8 q5, d0, d7\n" /* w19, int16, out1 */
"vmlal.s8 q6, d2, d7\n" /* w19, int16, out2 */
"vmull.s8 q7, d3, d6\n" /* w18, int16, out3 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d5, d7\n" /* w19, int16, out3 */
"sub %[r3], %[r3], #16\n" /* r3 = r3 - 16 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w20-w21 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vld1.32 {d0-d3}, [%[r4]]!\n" /* load r4, 0-3 */
"vld1.32 {d4-d5}, [%[r4]]!\n" /* load r4, 4-5 */
/* inr4 */
"vmull.s8 q4, d0, d6\n" /* w20, int16, out0 */
"vmull.s8 q5, d2, d6\n" /* w20, int16, out1 */
"vmull.s8 q6, d4, d6\n" /* w20, int16, out2 */
"vmlal.s8 q4, d1, d7\n" /* w21, int16, out0 */
"vld1.32 {d0-d1}, [%[r4]]!\n" /* load r4, 6-7 */
"vmlal.s8 q5, d3, d7\n" /* w21, int16, out1 */
"vmlal.s8 q6, d5, d7\n" /* w21, int16, out2 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vmull.s8 q7, d0, d6\n" /* w20, int16, out3 */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d1, d7\n" /* w21, int16, out3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w22-w23 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"vmull.s8 q4, d2, d6\n" /* w22, int16, out0 */
"vmull.s8 q5, d4, d6\n" /* w22, int16, out1 */
"vmull.s8 q6, d0, d6\n" /* w22, int16, out2 */
"vmlal.s8 q4, d3, d7\n" /* w23, int16, out0 */
"vld1.32 {d2-d3}, [%[r4]]!\n" /* load r4, 7-8 */
"vmlal.s8 q5, d5, d7\n" /* w23, int16, out1 */
"vmlal.s8 q6, d1, d7\n" /* w23, int16, out2 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vmull.s8 q7, d2, d6\n" /* w22, int16, out3 */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vmlal.s8 q7, d3, d7\n" /* w23, int16, out3 */
"vld1.32 {d6}, [%[wptr]]!\n" /* load w24 */
"vld1.32 {d5}, [%[r4]]\n" /* load r4, 10 */
"sub %[r4], %[r4], #16\n" /* r4 = r4 - 16 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"sub %[wptr], %[wptr], #200 \n" /* wptr = wptr - 200 */
"vmull.s8 q4, d4, d6\n" /* w22, int16, out0 */
"vmull.s8 q5, d0, d6\n" /* w22, int16, out1 */
"vmull.s8 q6, d2, d6\n" /* w22, int16, out2 */
"vmull.s8 q7, d5, d6\n" /* w22, int16, out3 */
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load r0, 0-3 */
"vld1.32 {d6-d7}, [%[wptr]]!\n" /* load w0-w1 */
"vaddw.s16 q8, q8, d8\n" /* add to out0 low */
"vaddw.s16 q9, q9, d9\n" /* add to out0 hig */
"vld1.32 {d4-d5}, [%[r0]]!\n" /* load r0, 0-3 */
"vaddw.s16 q10, q10, d10\n" /* add to out1 low */
"vaddw.s16 q11, q11, d11\n" /* add to out1 hig */
"vst1.32 {d16-d19}, [%[ptr_out0]]!\n"/* store out0 */
"vaddw.s16 q12, q12, d12\n" /* add to out2 low */
"vaddw.s16 q13, q13, d13\n" /* add to out2 hig */
"vst1.32 {d20-d23}, [%[ptr_out0]]!\n"/*store out1 */
"vaddw.s16 q14, q14, d14\n" /* add to out3 low */
"vaddw.s16 q15, q15, d15\n" /* add to out3 hig */
"subs %[cnt], #1\n" /* cnt = cnt - 1 */
"vst1.32 {d24-d27}, [%[ptr_out0]]!\n"/* store out2 */
"vst1.32 {d28-d31}, [%[ptr_out0]]!\n"/* store out3 */
"bne 1b\n" /* branch main loop */
: [cnt] "+r"(cnt),
[r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[r3] "+r"(inr3),
[r4] "+r"(inr4),
[ptr_out0] "+r"(ptr_out0),
[wptr] "+r"(wptr)
:
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
// clang-format on
block_inr0 = block_inr2;
block_inr1 = block_inr3;
block_inr2 = block_inr4;
block_inr3 = block_inr2 + in_len;
block_inr4 = block_inr3 + in_len;
}
write_int32_nchwc8_to_nchw<Dtype>(pre_out,
reinterpret_cast<Dtype*>(dout_batch),
c,
c + hout_c_block,
h,
h + h_kernel,
0,
wout_round,
chout,
hout,
wout,
flag_relu,
bias_local,
flag_bias,
ptr_write,
scale + c);
}
}
}
}
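The `block_inr*` assignments at the bottom of the row loop slide the five-row input window down by two rows per output row block (stride 2), so three already-padded rows are reused between iterations. A minimal C++ sketch of that rotation (a hypothetical helper, not part of the source, assuming `rows` holds pointers to consecutive padded input rows of length `in_len`):

```cpp
#include <cstdint>

// Slide a 5-row window forward by 2 rows for a stride-2, 5-tap kernel.
inline void rotate_rows_stride2(const int8_t* rows[5], int in_len) {
  rows[0] = rows[2];           // two rows consumed, three rows reused
  rows[1] = rows[3];
  rows[2] = rows[4];
  rows[3] = rows[2] + in_len;  // next padded input row
  rows[4] = rows[3] + in_len;
}
```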
template void conv_depthwise_5x5s2_int8<int8_t>(int8_t* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
template void conv_depthwise_5x5s2_int8<float>(float* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -189,6 +189,24 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
int padh,
ARMContext* ctx);
template <typename Dtype>
void conv_depthwise_5x5s2_int8(Dtype* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -880,6 +880,23 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
pad_w,
pad_h,
ctx);
} else if (stride == 2) {
conv_depthwise_5x5s2_int8(reinterpret_cast<float*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_relu,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
} else {
LOG(FATAL) << "unsupported stride for 5x5 dw conv int8";
}
......@@ -922,6 +939,23 @@ void conv_depthwise_5x5_int8_int8(const void* din,
pad_w,
pad_h,
ctx);
} else if (stride == 2) {
conv_depthwise_5x5s2_int8(reinterpret_cast<int8_t*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_relu,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
} else {
LOG(FATAL) << "unsupported stride for 5x5 dw conv int8";
}
......
......@@ -922,7 +922,7 @@ void sgemv_trans(const int M,
/* end */ \
"4: \n" /* end */ \
"fmov s1, %w[alpha] \n" /* mov alpha to s1 */ \
"fcmp s8, #0 \n" /* cmp with zero*/ \
"fcmp s8, #0.0 \n" /* cmp with zero*/ \
"bge 5f \n" /* if ge zero */ \
"fmul s8, s8, s1 \n" /* out * alpha */ \
"5: \n" /* leakey relu label */ \
......
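The `fcmp`/`bge`/`fmul` tail above is a scalar leaky-ReLU: the accumulator is scaled by alpha only when it is negative. A minimal C++ sketch of the same semantics (a hypothetical helper, not the macro itself):

```cpp
// Leaky-ReLU tail of the sgemv epilogue: scale only negative outputs.
inline float leaky_relu(float out, float alpha) {
  return out >= 0.f ? out : out * alpha;  // "bge 5f" skips the "fmul"
}
```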
......@@ -46,6 +46,7 @@ void fp32_to_int8(const float* din,
float inv_scale = 1.f / scale[j % axis_size];
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vscale = vdupq_n_f32(inv_scale);
float32x4_t vmax = vdupq_n_f32(-127.f);
float32x4_t vpoff = vdupq_n_f32(0.5f);
float32x4_t vnoff = vdupq_n_f32(-0.5f);
const float* din_c = din + j * inner_size;
......@@ -63,6 +64,14 @@ void fp32_to_int8(const float* din,
"fmul v5.4s, v1.4s, %[scale].4s \n"
"fmul v6.4s, v2.4s, %[scale].4s \n"
"fmul v7.4s, v3.4s, %[scale].4s \n"
"fcmge v8.4s, v4.4s, %[vmax].4s \n"
"fcmge v9.4s, v5.4s, %[vmax].4s \n"
"fcmge v10.4s, v6.4s, %[vmax].4s \n"
"fcmge v11.4s, v7.4s, %[vmax].4s \n"
"bif v4.16b, %[vmax].16b, v8.16b \n"
"bif v5.16b, %[vmax].16b, v9.16b \n"
"bif v6.16b, %[vmax].16b, v10.16b \n"
"bif v7.16b, %[vmax].16b, v11.16b \n"
"ldp q0, q1, [%[in]], #32 \n"
"subs %[cnt], %[cnt], #1 \n"
"FCVTAS v8.4s, v4.4s \n"
......@@ -79,7 +88,7 @@ void fp32_to_int8(const float* din,
"str q8, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: [scale] "w"(vscale), [vmax] "w"(vmax)
: "v0",
"v1",
"v2",
......@@ -104,15 +113,23 @@ void fp32_to_int8(const float* din,
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset\n"
"vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset\n"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset\n"
"vbif.f32 q7, %q[vnoff], q8 @ get right offset\n"
"vmla.f32 q4, q0, %q[vscale] @ mul scale\n"
"vmla.f32 q5, q1, %q[vscale] @ mul scale\n"
"vmla.f32 q6, q2, %q[vscale] @ mul scale\n"
"vmla.f32 q7, q3, %q[vscale] @ mul scale\n"
"vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n"
"vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n"
"vbif q4, %q[vmax], q8 @ choose \n"
"vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n"
"vbif q5, %q[vmax], q9 @ choose \n"
"vbif q6, %q[vmax], q10 @ choose \n"
"vbif q7, %q[vmax], q8 @ choose \n"
"vcvt.s32.f32 q0, q4 @ cvt to int32\n"
"vcvt.s32.f32 q1, q5 @ cvt to int32\n"
"vcvt.s32.f32 q2, q6 @ cvt to int32\n"
......@@ -133,25 +150,16 @@ void fp32_to_int8(const float* din,
: [vscale] "w"(vscale),
[vpoff] "w"(vpoff),
[vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
[vzero] "w"(vzero),
[vmax] "w"(vmax)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10");
#endif
}
const float* din_r = din_c + 16 * cnt;
signed char* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
dout_r[i] = dout_r[i] < -127 ? -127 : dout_r[i];
}
}
}
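Both the NEON body and the scalar remainder above round to nearest and clamp to the symmetric range [-127, 127] (the added `vmax` vector holds -127). A self-contained C++ reference of that semantics (a sketch; not the library's `saturate_cast` helper):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Reference fp32 -> int8 symmetric quantization matching the kernel above.
inline int8_t quantize_symmetric(float x, float scale) {
  float v = std::round(x / scale);           // kernel precomputes inv_scale = 1.f / scale
  v = std::min(127.f, std::max(-127.f, v));  // clamp, mirroring the added vmax path
  return static_cast<int8_t>(v);
}
```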
......
......@@ -151,6 +151,10 @@ class TensorLite {
size_t offset() const { return offset_; }
bool IsInitialized() const { return buffer_->data(); }
void clear() {
buffer_->Free();
offset_ = 0;
}
// Other share data to this.
void ShareDataWith(const TensorLite &other);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// element-wise copy (memcpy is not available inside OpenCL kernels)
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// element-wise copy (memcpy is not available inside OpenCL kernels)
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// element-wise copy (memcpy is not available inside OpenCL kernels)
__global CL_DTYPE* dst = out_data + offset_out;
__global const CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}
}
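Both buffer kernels address the tensors through the usual pre/axis/post decomposition of a concat: every element offset splits into a (pre, axis, post) coordinate. A host-side C++ reference of the `concat_mul` addressing (a sketch; the parameter meanings are inferred from the kernel and are assumptions):

```cpp
// Host-side reference for concat_mul addressing: copy one input whose
// concat-axis extent is axis_size into the output at axis offset `start`.
// total  = output axis extent * post_size, total0 = axis_size * post_size.
void concat_axis_copy(const float* x, float* out,
                      int pre_size, int axis_size, int post_size,
                      int start, int total, int total0) {
  for (int index = 0; index < axis_size; ++index) {
    for (int i = 0; i < pre_size; ++i) {
      const float* src = x + index * post_size + i * total0;
      float* dst = out + (start + index) * post_size + i * total;
      for (int k = 0; k < post_size; ++k) dst[k] = src[k];
    }
  }
}
```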
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void sigmoid(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
const int index = get_global_id(0);
if (index < count) {
out_data[index] = 1 / (1 + exp(-x_data[index]));
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int axis_size, int flag, int width) {
const int x = get_global_id(0); // image_width cxw/4
const int y = get_global_id(1); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int xx = x / width;
if (flag == 0){
xx = y / width;
}
if (xx < axis_size){
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(x, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}else{
int new_val = xx - axis_size;
new_val *= width;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, (int2)(new_val, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
}
__kernel void concat_mul(__read_only image2d_t input0,
__write_only image2d_t output,
int axis_size, int flag, int width, int start) {
const int x = get_global_id(0); // image_width cxw/4
const int y = get_global_id(1); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int xx = x / width;
if (flag == 0){
xx = y / width;
}
if (xx < axis_size && xx >= start){
xx -= start;
xx *= width;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, (int2)(xx, y));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__private const int output_c,
__private const int filter_channel,
__private const int filter_width,
__private const int filter_height,
__private const int group) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 output_pos_in_one_block;
output_pos_in_one_block.x = out_w;
output_pos_in_one_block.y = out_nh;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = output_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = output_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input[9]; // 3x3 region of input
if (group == 1) {
for (int i = 0; i < input_c; ++i) { // each run for 3x3
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
// accumulate the nine 3x3 taps into the four output channels of this pixel
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y += 3;
CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y += 3;
CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y += 3;
CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
}
} else { // group != 1
for (int i = 0; i < 4; i++) {
int used_input_channel_num =
(out_c * 4 + i) / (output_c / group) * filter_channel;
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
CL_DTYPE tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
CL_DTYPE f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
CL_DTYPE input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
}
}
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
......@@ -142,7 +142,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
#endif
#ifdef RELU
output = activation_type4(output);
#endif
......@@ -309,8 +309,8 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
#endif
#ifdef RELU
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
#endif
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void scale(__read_only image2d_t input,
__write_only image2d_t output,
__private float scale,
__private float bias){
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = convert_float(scale) * in + convert_float(bias);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = 1 / (1 + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......@@ -142,14 +142,13 @@ void StrideScal(const T* a, const T* x, T* y, int n, int stride);
// remain is the product of dimension shapes after the axis dimension
template <typename T>
void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
for (int i = 0; i < bs; ++i) {
T entity = x[i * n];
for (int c = 1; c < n; ++c) {
entity = x[i * n + c] > entity ? x[i * n + c] : entity;
}
for (int c = 0; c < n; ++c) {
y[i * n + c] = x[i * n + c] - entity;
}
}
VExp(y, y, n * bs);
......
......@@ -110,11 +110,7 @@ void set_constant(const lite::Context<Target>& context,
lite::Tensor* tensor,
float value) {
TensorSetConstantWithTarget<Target> func(context, tensor, value);
func();
}
template <typename T>
......@@ -123,17 +119,19 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
const lite::Tensor& input,
const lite::Tensor& vector,
lite::Tensor* output) {
const auto& in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
}
}
}
};
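Each row of the row-major input has `size = input.numel() / in_dims[0]` elements, so element (i, j) lives at offset `i * size + j`; the loop above adds the vector to every row. A minimal sketch of the same arithmetic, with hypothetical names:

```cpp
#include <vector>

// Row-major rowwise add: out[i][j] = in[i][j] + vec[j].
std::vector<float> rowwise_add(const std::vector<float>& in,
                               const std::vector<float>& vec,
                               int rows, int size) {
  std::vector<float> out(in.size());
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < size; ++j)
      out[i * size + j] = in[i * size + j] + vec[j];
  return out;
}
```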
......
......@@ -29,6 +29,11 @@ void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
act_types.push_back("leaky_relu");
break;
}
if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
act_types.push_back("relu6");
act_types.push_back("leaky_relu");
break;
}
}
for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
for (auto act_type : act_types) {
......
......@@ -53,6 +53,11 @@ void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
}
valid_kernels_ = op_->CreateKernels(valid_places);
}
void mir::Node::Stmt::ResetKernels(const std::vector<Place> &valid_places) {
CHECK(op_) << "change valid place failed, not created op";
valid_kernels_.clear();
valid_kernels_ = op_->CreateKernels(valid_places);
}
mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
auto &x = AsArg();
......
......@@ -53,6 +53,7 @@ class Node {
const std::vector<Place>& valid_places,
lite::Scope* scope = nullptr);
void ResetKernels(const std::vector<Place>& valid_places);
std::string op_type() const { return op_info()->Type(); }
const OpInfo* op_info() const;
OpInfo* mutable_op_info();
......
......@@ -157,7 +157,7 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
lite_api::LiteModelType::kNaiveBuffer);
// Load optimized model
lite_api::MobileConfig mobile_config;
mobile_config.set_model_from_file(optimized_model_dir + ".nb");
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config);
......@@ -203,7 +203,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
valid_places,
input_tensor_shape,
input_tensor_type,
FLAGS_optimized_model_dir + "/ref_opt_model");
FLAGS_optimized_model_dir + "_ref_opt_model");
// Generate and run optimized model on NPU/XPU as the target predictor
#ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
......@@ -217,7 +217,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
valid_places,
input_tensor_shape,
input_tensor_type,
FLAGS_optimized_model_dir + "/tar_opt_model");
FLAGS_optimized_model_dir + "_tar_opt_model");
// Check the difference of the output tensors between reference predictor and
// target predictor
CheckOutputTensors(tar_predictor, ref_predictor, output_tensor_type);
......
......@@ -137,11 +137,15 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue;
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst)
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......
......@@ -42,7 +42,7 @@ static std::string version() {
std::string tag = paddlelite_tag();
if (tag.empty()) {
ss << paddlelite_commit();
} else {
ss << tag;
}
......
# C++ Demo
> Welcome to join the official PaddleLite Baidu QQ group (696965088), where engineers will answer your questions.
1. Environment setup
- Make sure the Android NDK is installed under /opt
- A computer that can build PaddleLite
- An Android phone with an armv7 or armv8 CPU
2. Face detection and mask-wearing classification demo
Prepare the build environment by following the [source compilation guide](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/).
Run the following commands to download the PaddleLite source code.
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
```
From the PaddleLite root directory, build the inference library.
```shell
./lite/tools/build.sh \
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
--android_stl=c++_static \
--build_extra=ON \
--shutdown_log=OFF \
full_publish
```
Enter the build directory, download and extract the archive containing the model and test image, then build the executable.
```shell
cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mask_detection
wget https://paddle-inference-dist.bj.bcebos.com/mask_detection.tar.gz
tar zxvf mask_detection.tar.gz
make
```
Alternatively, the face detection and mask classification models can be downloaded through PaddleHub.
```
# After installing paddlehub, run the following code in Python
import paddlehub as hub
pyramidbox_lite_mobile_mask = hub.Module(name="pyramidbox_lite_mobile_mask")
# Save the models into the test_program folder
pyramidbox_lite_mobile_mask.processor.save_inference_model(dirname="test_program")
# The commands above produce the face detection and mask classification models, stored in pyramidbox_lite and mask_detector respectively. In each folder, __model__ is the model topology file and __param__ is the weights file.
# PaddleHub provides inference models, which must be converted with PaddleLite's model_optimize_tool; see the [model conversion docs](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/model_optimize_tool/).
```
Connect the Android phone to the computer, then push the executable, test image, model files, and inference library to the phone.
```
adb push mask_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push face_detection /data/local/tmp
adb push mask_classification /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mask_detection
```
Open a shell on the phone and run the demo.
```
adb shell
cd /data/local/tmp
export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH
./mask_detection face_detection mask_classification test.jpg
```
Back on the computer, pull the result image and view the output shown below.
```
adb pull /data/local/tmp/test_mask_detection_result.jpg ./
```
![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg)
3. Build and run the full-API demo (note: this demo is not available when building in tiny_publish mode)
```shell
cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
......@@ -17,7 +87,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
```
On success, the console prints the predicted probabilities of the top-10 classes.
4. Build and run the light-API demo
```shell
cd ../mobile_light
make
......@@ -29,7 +99,7 @@ adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
```
On success, the console prints the predicted probabilities of the top-10 classes.
5. Build and run the SSD object detection demo
```shell
cd ../ssd_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
......@@ -46,7 +116,7 @@ adb pull /data/local/tmp/test_ssd_detection_result.jpg ./
```
On success, the detection result image test_ssd_detection_result.jpg appears in the ssd_detection directory.
6. Build and run the YOLOv3 object detection demo
```shell
cd ../yolov3_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz
......@@ -63,7 +133,7 @@ adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
```
On success, the detection result image test_yolov3_detection_result.jpg appears in the yolov3_detection directory.
7. Build and run the image classification demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
......@@ -71,41 +141,41 @@ tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push mobile_classify /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobile_classify
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the console prints the predicted probabilities of the top-5 classes.
- To print the top-10 class probabilities instead, append a topk value to the command, e.g.:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- To classify with another model, append its model_dir and the model input size to the command, e.g.:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
8. Build and run the model unit-test demo with the CV preprocessing library
```shell
cd ../test_cv
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push test_model_cv /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/test_model_cv
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the console prints the predicted probabilities of the top-10 classes.
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                            #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mask_detection: fetch_opencv mask_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS)
mask_detection.o: mask_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mask_detection.o
rm -f mask_detection
ARM_ABI = arm8
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                             #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that links `libpaddle_light_api_shared.so`.
# 2. Uncomment the line below that links `libpaddle_api_light_bundled.a`.
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mask_detection: fetch_opencv mask_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mask_detection.o -o mask_detection $(CXX_LIBS) $(LDFLAGS)
mask_detection.o: mask_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mask_detection.o -c mask_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mask_detection.o
rm -f mask_detection
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>  // NEON intrinsics (float32x4_t, vld3q_f32, ...) used below
#include <iostream>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
struct Object {
int batch_id;
cv::Rect rec;
int class_id;
float prob;
};
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
// Fill the output tensor with (x - mean) * scale and convert the layout from
// NHWC to NCHW, using NEON intrinsics for speed.
void neon_mean_scale(const float* din,
float* dout,
int size,
const std::vector<float>& mean,
const std::vector<float>& scale) {
if (mean.size() != 3 || scale.size() != 3) {
std::cerr << "[ERROR] mean and scale must each have 3 elements\n";
exit(1);
}
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(scale[0]);
float32x4_t vscale1 = vdupq_n_f32(scale[1]);
float32x4_t vscale2 = vdupq_n_f32(scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) * scale[0];
*(dout_c1++) = (*(din++) - mean[1]) * scale[1];
*(dout_c2++) = (*(din++) - mean[2]) * scale[2];
}
}
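// For reference, a plain scalar version of the same NHWC -> NCHW mean/scale
// transform (a hypothetical helper added for this write-up, not called by the
// demo); it should produce the same planar output as the NEON path above and
// is handy for checking sizes that are not multiples of 4.
void naive_mean_scale(const float* din,
                      float* dout,
                      int size,
                      const std::vector<float>& mean,
                      const std::vector<float>& scale) {
  for (int i = 0; i < size; i++) {
    for (int c = 0; c < 3; c++) {
      // pixel i, channel c of the interleaved input -> plane c of the output
      dout[c * size + i] = (din[i * 3 + c] - mean[c]) * scale[c];
    }
  }
}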
void pre_process(const cv::Mat& img,
int width,
int height,
const std::vector<float>& mean,
const std::vector<float>& scale,
float* data,
bool is_scale = false) {
cv::Mat resized_img;
cv::resize(
img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC);
cv::Mat imgf;
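// If is_scale is set, map the 8-bit pixel values into roughly [0, 1); note
// that this demo divides by 256 rather than the more common 255.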
float scale_factor = is_scale ? 1.f / 256 : 1.f;
resized_img.convertTo(imgf, CV_32FC3, scale_factor);
const float* dimg = reinterpret_cast<const float*>(imgf.data);
neon_mean_scale(dimg, data, width * height, mean, scale);
}
void RunModel(std::string det_model_dir,
std::string class_model_dir,
std::string img_path) {
// Prepare
cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR);
float shrink = 0.2;
int width = img.cols;
int height = img.rows;
int s_width = static_cast<int>(width * shrink);
int s_height = static_cast<int>(height * shrink);
// Detection
MobileConfig config;
config.set_model_dir(det_model_dir);
// Create Predictor For Detection Model
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
// Get Input Tensor
std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
input_tensor0->Resize({1, 3, s_height, s_width});
auto* data = input_tensor0->mutable_data<float>();
// Do PreProcess
std::vector<float> detect_mean = {104.f, 117.f, 123.f};
std::vector<float> detect_scale = {0.007843, 0.007843, 0.007843};
pre_process(img, s_width, s_height, detect_mean, detect_scale, data, false);
// Detection Model Run
predictor->Run();
// Get Output Tensor
std::unique_ptr<const Tensor> output_tensor0(
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor0->data<float>();
auto shape_out = output_tensor0->shape();
int64_t out_len = ShapeProduction(shape_out);
// Filter Out Detection Box
float detect_threshold = 0.3;
std::vector<Object> detect_result;
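// Each detection occupies 6 floats, assumed here to follow the usual SSD
// layout: [class_id, score, xmin, ymin, xmax, ymax], with box coordinates
// normalized relative to the original image.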
for (int i = 0; i < out_len / 6; ++i) {
if (outptr[1] >= detect_threshold) {
Object obj;
int xmin = static_cast<int>(width * outptr[2]);
int ymin = static_cast<int>(height * outptr[3]);
int xmax = static_cast<int>(width * outptr[4]);
int ymax = static_cast<int>(height * outptr[5]);
int w = xmax - xmin;
int h = ymax - ymin;
cv::Rect rec_clip =
cv::Rect(xmin, ymin, w, h) & cv::Rect(0, 0, width, height);
obj.rec = rec_clip;
detect_result.push_back(obj);
}
outptr += 6;
}
// Classification
config.set_model_dir(class_model_dir);
// Create Predictor For Classification Model
predictor = CreatePaddlePredictor<MobileConfig>(config);
// Get Input Tensor
std::unique_ptr<Tensor> input_tensor1(std::move(predictor->GetInput(0)));
int classify_w = 128;
int classify_h = 128;
input_tensor1->Resize({1, 3, classify_h, classify_w});
auto* input_data = input_tensor1->mutable_data<float>();
int detect_num = detect_result.size();
std::vector<float> classify_mean = {0.5f, 0.5f, 0.5f};
std::vector<float> classify_scale = {1.f, 1.f, 1.f};
float classify_threshold = 0.5;
for (int i = 0; i < detect_num; ++i) {
cv::Rect rec_clip = detect_result[i].rec;
cv::Mat roi = img(rec_clip);
// Do PreProcess
pre_process(roi,
classify_w,
classify_h,
classify_mean,
classify_scale,
input_data,
true);
// Classification Model Run
predictor->Run();
// Get Output Tensor
std::unique_ptr<const Tensor> output_tensor1(
std::move(predictor->GetOutput(1)));
auto* outptr = output_tensor1->data<float>();
// Draw Detection and Classification Results
cv::rectangle(img, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
std::string text = outptr[1] > classify_threshold ? "wear mask" : "no mask";
int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
double font_scale = 1.f;
int thickness = 1;
cv::Size text_size =
cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width;
text_size =
cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr);
cv::Point origin;
origin.x = rec_clip.x + 5;
origin.y = rec_clip.y + text_size.height + 5;
cv::putText(img,
text,
origin,
font_face,
new_font_scale,
cv::Scalar(0, 255, 255),
thickness,
cv::LINE_AA);
std::cout << "detect face, location: x=" << rec_clip.x
<< ", y=" << rec_clip.y << ", width=" << rec_clip.width
<< ", height=" << rec_clip.height
<< ", wear mask: " << (outptr[1] > classify_threshold)
<< std::endl;
}
// Write Result to Image File
int start = img_path.find_last_of("/");
int end = img_path.find_last_of(".");
std::string img_name = img_path.substr(start + 1, end - start - 1);
std::string result_name = img_name + "_mask_detection_result.jpg";
cv::imwrite(result_name, img);
}
int main(int argc, char** argv) {
if (argc < 4) {
std::cerr << "[ERROR] usage: " << argv[0]
<< " detection_model_dir classification_model_dir image_path\n";
exit(1);
}
std::string detect_model_dir = argv[1];
std::string classify_model_dir = argv[2];
std::string img_path = argv[3];
RunModel(detect_model_dir, classify_model_dir, img_path);
return 0;
}
......@@ -28,6 +28,9 @@ void RunModel(std::string model_dir) {
// 1. Set MobileConfig
MobileConfig config;
config.set_model_dir(model_dir);
+// To load a model transformed by opt after release/v2.3.0, please use
+// `set_model_from_file` as listed below.
+// config.set_model_from_file(model_dir);
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> predictor =
......
......@@ -82,8 +82,8 @@ void neon_mean_scale(const float* din,
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) * scale[0];
-*(dout_c0++) = (*(din++) - mean[1]) * scale[1];
-*(dout_c0++) = (*(din++) - mean[2]) * scale[2];
+*(dout_c1++) = (*(din++) - mean[1]) * scale[1];
+*(dout_c2++) = (*(din++) - mean[2]) * scale[2];
}
}
......@@ -188,13 +188,12 @@ void RunModel(std::string model_dir, std::string img_path) {
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor->data<float>();
auto shape_out = output_tensor->shape();
-int64_t cnt = 1;
-for (auto& i : shape_out) {
-cnt *= i;
-}
+int64_t cnt = ShapeProduction(shape_out);
auto rec_out = detect_object(outptr, static_cast<int>(cnt / 6), 0.6f, img);
-std::string result_name =
-img_path.substr(0, img_path.find(".")) + "_ssd_detection_result.jpg";
+int start = img_path.find_last_of("/");
+int end = img_path.find_last_of(".");
+std::string img_name = img_path.substr(start + 1, end - start - 1);
+std::string result_name = img_name + "_ssd_detection_result.jpg";
cv::imwrite(result_name, img);
}
......
......@@ -61,7 +61,7 @@ add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kerne
## 3. extra kernels
add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(density_prior_box_compute_arm ARM extra SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
......@@ -107,7 +107,7 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh);
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2));
-bool flag_dw_5x5 = pads_all_equal && (kw == 5 && kh == 5 && sw == 1);
+bool flag_dw_5x5 = pads_all_equal && (kw == 5 && (sw == 1 || sw == 2));
bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
......@@ -152,7 +152,7 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh);
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2));
bool flag_dw_5x5 = pads_all_equal && (kw == 5 && kh == 5 && sw == 1);
bool flag_dw_5x5 = pads_all_equal && (kw == 5 && (sw == 1 || sw == 2));
bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
......
......@@ -95,7 +95,7 @@ class FcCompute : public KernelLite<TARGET(kARM), PType> {
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
-CHECK_EQ(param.output->dims().size(), 2UL);
+CHECK_GE(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
......
......@@ -67,22 +67,22 @@ void LookupTableCompute::Run() {
REGISTER_LITE_KERNEL(lookup_table,
kARM,
-kFloat,
+kAny,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
-.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
-kFloat,
+kAny,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
-.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
......@@ -21,7 +21,7 @@ namespace lite {
namespace kernels {
namespace arm {
-class LookupTableCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+class LookupTableCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
public:
using param_t = operators::LookupTableParam;
......
......@@ -53,7 +53,7 @@ void lookup_table_compute_ref(const operators::LookupTableParam &param) {
TEST(lookup_table_arm, retrieve_op) {
auto lookup_table =
-      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kAny)>(
"lookup_table");
ASSERT_FALSE(lookup_table.empty());
ASSERT_TRUE(lookup_table.front());
......@@ -61,7 +61,7 @@ TEST(lookup_table_arm, retrieve_op) {
TEST(lookup_table_arm, init) {
LookupTableCompute lookup_table;
-ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat));
+ASSERT_EQ(lookup_table.precision(), PRECISION(kAny));
ASSERT_EQ(lookup_table.target(), TARGET(kARM));
}
......@@ -112,4 +112,4 @@ TEST(lookup_table_arm, compute) {
} // namespace lite
} // namespace paddle
-USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(lookup_table, kARM, kAny, kNCHW, def);
......@@ -65,6 +65,6 @@ REGISTER_LITE_KERNEL(write_to_array,
paddle::lite::kernels::arm::WriteToArrayCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("I", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("I", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorListTy(TARGET(kARM))})
.Finalize();
......@@ -15,7 +15,12 @@ lite_cc_library(subgraph_bridge_softmax_op_bm SRCS softmax_op.cc DEPS ${subgraph
lite_cc_library(subgraph_bridge_mul_op_bm SRCS mul_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_bm SRCS batch_norm_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_scale_op_bm SRCS scale_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_bm SRCS concat_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_dropout_op_bm SRCS dropout_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_bm SRCS transpose_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_bm SRCS reshape_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_norm_op_bm SRCS norm_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_prior_box_op_bm SRCS prior_box_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_engine
......@@ -28,4 +33,10 @@ set(bm_subgraph_bridges
subgraph_bridge_mul_op_bm
subgraph_bridge_batch_norm_op_bm
subgraph_bridge_scale_op_bm
subgraph_bridge_concat_op_bm
subgraph_bridge_dropout_op_bm
subgraph_bridge_transpose_op_bm
subgraph_bridge_reshape_op_bm
subgraph_bridge_norm_op_bm
subgraph_bridge_prior_box_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
......@@ -45,7 +45,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
CHECK_EQ(op_type, "relu");
float alpha = 0.f;
if (op_type == "relu") {
} else if (op_type == "leaky_relu") {
alpha = op_info->GetAttr<float>("alpha");
} else {
LOG(FATAL) << "[BM] unsupport act type";
return FAILED;
}
add_relu_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
......@@ -53,7 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
-0.f,
+alpha,
-1.f);
graph->AddNode(output_var_name);
return SUCCESS;
......@@ -65,3 +72,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kBM, paddle::lite::subgraph::bm::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kBM,
paddle::lite::subgraph::bm::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_names = op_info->Input("X");
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
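// Pack each input's shape, rank and tensor name into raw arrays, since
// add_concat_layer takes a variable number of inputs in this form.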
const int32_t input_num = x_names.size();
int32_t** shape = new int32_t*[input_num];
int32_t* dim = new int32_t[input_num];
const char** name = new const char*[input_num];
for (size_t i = 0; i < x_names.size(); i++) {
auto x = scope->FindMutableTensor(x_names[i]);
name[i] = x_names[i].c_str();
auto x_dims = x->dims();
dim[i] = x_dims.size();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
shape[i] = new int32_t[x_dims.size()];
for (size_t j = 0; j < x_dims.size(); j++) {
shape[i][j] = static_cast<int32_t>(x_shape_data[j]);
}
}
auto axis = op_info->GetAttr<int>("axis");
add_concat_layer(graph->GetCompilerHandle(),
input_num,
shape,
dim,
name,
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
axis);
for (size_t i = 0; i < x_names.size(); i++) {
delete[] shape[i];
}
delete[] shape;
delete[] name;
delete[] dim;
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kBM,
paddle::lite::subgraph::bm::ConcatConverter);
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
......@@ -58,10 +57,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < input_dims.size(); i++) {
-i_input_shape_data[i] = static_cast<int>(input_shape_data[i]);
+i_input_shape_data[i] = static_cast<int32_t>(input_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
-i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
+i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
}
const float* filter_data =
const_cast<const float*>(filter->mutable_data<float>());
......@@ -69,7 +68,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
add_conv_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
input_dims.size(),
......@@ -104,3 +102,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kBM,
paddle::lite::subgraph::bm::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kBM,
paddle::lite::subgraph::bm::ConvConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto dropout_prob = op_info->GetAttr<float>("dropout_prob");
auto dropout_implementation =
op_info->GetAttr<std::string>("dropout_implementation");
CHECK_EQ(dropout_implementation, "downgrade_in_infer");
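// With "downgrade_in_infer", inference only scales the input by
// (1 - dropout_prob), so dropout lowers to a constant element-wise multiply.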
add_const_binary_layer(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
1.f - dropout_prob,
static_cast<const char*>(output_var_name.c_str()),
BINARY_MUL,
0);
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(dropout,
kBM,
paddle::lite::subgraph::bm::DropoutConverter);
......@@ -14,6 +14,7 @@
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
......@@ -68,42 +69,52 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
-if (y_is_const) {
-CHECK_EQ(op_type, "elementwise_add");
-}
+auto axis = op_info->GetAttr<int>("axis");
int op_code{-1};
+int eltwise_if_code{-1};
float coeff[2] = {1.f, 1.f};
if (op_type == "elementwise_mul") {
-op_code = 0;
+op_code = BINARY_MUL;
+eltwise_if_code = 0;
} else if (op_type == "elementwise_add") {
-op_code = 1;
+op_code = BINARY_ADD;
+eltwise_if_code = 1;
} else if (op_type == "elementwise_sub") {
-op_code = 1;
+op_code = BINARY_SUB;
+eltwise_if_code = 1;
coeff[1] = -1.f;
} else {
LOG(FATAL) << "UNSUPPORTED ELTWISE OPERATION: " << op_type;
}
-if (!y_is_const) {
-add_eltwise_layer(graph->GetCompilerHandle(),
-input_num,
-shape,
-dim,
-name,
-const_cast<const int*>(&i_output_shape_data[0]),
-output_dims.size(),
-static_cast<const char*>(output_var_name.c_str()),
-op_code,
-coeff);
-} else {
-const float* y_data = const_cast<const float*>(y->mutable_data<float>());
-const float* x_data = const_cast<const float*>(x->mutable_data<float>());
-bm_add_const_tensor(graph->GetCompilerHandle(),
-name[1],
-shape[0],
-dim[0],
-static_cast<bm_data_type_t>(DTYPE_FP32),
-static_cast<const void*>(y_data));
+const float* y_data = const_cast<const float*>(y->mutable_data<float>());
+const float* x_data = const_cast<const float*>(x->mutable_data<float>());
+auto unique_op_name = lite::subgraph::bm::UniqueName("expand_ndims");
+std::vector<int32_t> i_expand_shape_data(3);
+if (y_is_const) {
+if (dim[0] == dim[1] || 2 == dim[0]) {
+bm_add_const_tensor(graph->GetCompilerHandle(),
+name[1],
+shape[1],
+dim[1],
+static_cast<bm_data_type_t>(DTYPE_FP32),
+static_cast<const void*>(y_data));
+} else if (1 == dim[1] && 1 == axis) {
+add_expand_ndims_layer(graph->GetCompilerHandle(),
+name[1],
+shape[1],
+dim[1],
+static_cast<const float*>(y_data),
+-1,
+2,
+static_cast<const char*>(unique_op_name.c_str()));
+name[1] = static_cast<const char*>(unique_op_name.c_str());
+dim[1] = 3;
+i_expand_shape_data[0] = i_y_shape_data[0];
+i_expand_shape_data[1] = 1;
+i_expand_shape_data[2] = 1;
+shape[1] = &i_expand_shape_data[0];
+y_data = nullptr;
+}
+add_binary_layer_v2(graph->GetCompilerHandle(),
+name[0],
+shape[0],
......@@ -111,12 +122,23 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
0,
static_cast<const float*>(x_data),
name[1],
-shape[0],
-dim[0],
+shape[1],
+dim[1],
0,
static_cast<const float*>(y_data),
static_cast<const char*>(output_var_name.c_str()),
-0);
+op_code);
+} else {
+add_eltwise_layer(graph->GetCompilerHandle(),
+input_num,
+shape,
+dim,
+name,
+const_cast<const int*>(&i_output_shape_data[0]),
+output_dims.size(),
+static_cast<const char*>(output_var_name.c_str()),
+eltwise_if_code,
+coeff);
}
delete[] shape;
delete[] name;
......@@ -133,3 +155,9 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kBM,
paddle::lite::subgraph::bm::ElementwiseConverter);
......@@ -41,8 +41,10 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
}
// add reshape layer
int i_x_reshape_shape_data[2];
-for (size_t i = 0; i < 2; i++) {
-i_x_reshape_shape_data[i] = static_cast<int>(x_shape_data[i]);
+i_x_reshape_shape_data[0] = static_cast<int>(x_shape_data[0]);
+i_x_reshape_shape_data[1] = 1;
+for (size_t i = 1; i < x_dims.size(); i++) {
+i_x_reshape_shape_data[1] *= static_cast<int>(x_shape_data[i]);
}
int reshape_param[] = {0, -1};
auto unique_op_reshape_name =
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
float one = 1.f;
auto epsilon = op_info->GetAttr<float>("epsilon");
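// The trailing arguments to add_normalize_layer are assumed to mean:
// across_spatial = 0, channel_shared = 1, a single shared scale of 1.0,
// and the epsilon read above.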
add_normalize_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
static_cast<const char*>(unique_op_name.c_str()),
0,
1,
&one,
epsilon);
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(norm, kBM, paddle::lite::subgraph::bm::NormConverter);
......@@ -15,10 +15,24 @@
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kBM);
USE_SUBGRAPH_BRIDGE(leaky_relu, kBM);
USE_SUBGRAPH_BRIDGE(conv2d, kBM);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_add, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kBM);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kBM);
USE_SUBGRAPH_BRIDGE(pool2d, kBM);
USE_SUBGRAPH_BRIDGE(softmax, kBM);
USE_SUBGRAPH_BRIDGE(mul, kBM);
USE_SUBGRAPH_BRIDGE(batch_norm, kBM);
USE_SUBGRAPH_BRIDGE(scale, kBM);
USE_SUBGRAPH_BRIDGE(concat, kBM);
USE_SUBGRAPH_BRIDGE(dropout, kBM);
USE_SUBGRAPH_BRIDGE(transpose, kBM);
USE_SUBGRAPH_BRIDGE(transpose2, kBM);
USE_SUBGRAPH_BRIDGE(reshape, kBM);
USE_SUBGRAPH_BRIDGE(reshape2, kBM);
USE_SUBGRAPH_BRIDGE(flatten, kBM);
USE_SUBGRAPH_BRIDGE(flatten2, kBM);
USE_SUBGRAPH_BRIDGE(norm, kBM);
USE_SUBGRAPH_BRIDGE(prior_box, kBM);
......@@ -65,6 +65,12 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
if (pooling_type == "avg") {
average_exclusive = op_info->GetAttr<bool>("exclusive");
}
if (global_pooling) {
paddings[0] = 0;
paddings[1] = 0;
ksize[0] = i_x_shape_data[2];
ksize[1] = i_x_shape_data[3];
}
add_pooling_layer(
graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
typedef struct __tag_st_priorbox_param {
std::vector<float> min_sizes;
std::vector<float> max_sizes;
std::vector<float> aspect_ratios;
std::vector<float> variances;
// Default-initialize the scalar fields: not every attribute is guaranteed
// to be present on the op, and the converter only sets them conditionally.
float step_w{0.f};
float step_h{0.f};
float offset{0.f};
int32_t img_w{0};
int32_t img_h{0};
int32_t prior_num{0};
bool min_max_aspect_ratios_order{false};
bool clip{false};
bool flip{false};
} st_priorbox_param;
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip,
std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6;
output_aspect_ratior->clear();
output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i];
bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true;
break;
}
}
if (!already_exist) {
output_aspect_ratior->push_back(ar);
if (flip) {
output_aspect_ratior->push_back(1.0f / ar);
}
}
}
}
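// Example: with flip = true, an input of {2.0} expands to {1.0, 2.0, 0.5}.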
float* compute_priorbox_kernel(OpLite* op, st_priorbox_param* param) {
auto op_info = op->op_info();
auto scope = op->scope();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto var = scope->FindVar(var_var_name)->GetMutable<lite::Tensor>();
std::vector<float> expand_aspect_ratios;
ExpandAspectRatios(param->aspect_ratios, param->flip, &expand_aspect_ratios);
param->aspect_ratios.clear();
for (size_t i = 0; i < expand_aspect_ratios.size(); i++) {
param->aspect_ratios.push_back(expand_aspect_ratios[i]);
}
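// Priors per spatial location: one box per (min_size, aspect_ratio) pair,
// plus one extra box per max_size (aspect_ratio 1, size sqrt(min * max)).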
param->prior_num = param->aspect_ratios.size() * param->min_sizes.size();
if (param->max_sizes.size() > 0) {
param->prior_num += param->max_sizes.size();
}
int32_t win1 = in_dims[3];
int32_t hin1 = in_dims[2];
DDim shape_out({hin1, win1, param->prior_num, 4});
boxes->Resize(shape_out);
var->Resize(shape_out);
// boxes->mutable_data<float>();
// var->mutable_data<float>();
float* cpu_data =
static_cast<float*>(malloc(sizeof(float) * boxes->data_size() * 2));
CHECK(cpu_data != nullptr);
const int32_t width = in_dims[3];
const int32_t height = in_dims[2];
int32_t img_width = param->img_w;
int32_t img_height = param->img_h;
if (img_width == 0 || img_height == 0) {
img_width = img_dims[3];
img_height = img_dims[2];
}
float step_w = param->step_w;
float step_h = param->step_h;
if (step_w == 0.f || step_h == 0.f) {
step_w = static_cast<float>(img_width) / width;
step_h = static_cast<float>(img_height) / height;
}
float offset = param->offset;
int32_t channel_size = height * width * param->prior_num * 4;
int32_t idx = 0;
///////////////////////////////////////////////////////////////////////
for (int32_t h = 0; h < height; ++h) {
for (int32_t w = 0; w < width; ++w) {
float center_x = (w + offset) * step_w;
float center_y = (h + offset) * step_h;
float box_width = 0.f;
float box_height = 0.f;
float* min_buf = reinterpret_cast<float*>(malloc(sizeof(float) * 4));
float* max_buf = reinterpret_cast<float*>(malloc(sizeof(float) * 4));
float* com_buf = reinterpret_cast<float*>(
malloc(sizeof(float) * expand_aspect_ratios.size() * 4));
CHECK(min_buf != nullptr);
CHECK(max_buf != nullptr);
CHECK(com_buf != nullptr);
// LOG(INFO) << "the number of min_size is " << min_sizes_.size();
for (size_t s = 0; s < param->min_sizes.size(); ++s) {
int32_t min_idx = 0;
int32_t max_idx = 0;
int32_t com_idx = 0;
int32_t min_size = param->min_sizes[s];
//! first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size;
//! xmin
min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height;
if (param->max_sizes.size() > 0) {
int max_size = param->max_sizes[s];
//! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size)
box_width = box_height = sqrtf(min_size * max_size);
//! xmin
max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height;
}
//! rest of priors
for (size_t r = 0; r < expand_aspect_ratios.size(); ++r) {
float ar = expand_aspect_ratios[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar);
box_height = min_size / sqrt(ar);
//! xmin
com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width;
//! ymin
com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height;
//! xmax
com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width;
//! ymax
com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height;
}
if (param->min_max_aspect_ratios_order) {
memcpy(cpu_data + idx, min_buf, sizeof(float) * min_idx);
idx += min_idx;
memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
idx += max_idx;
memcpy(cpu_data + idx, com_buf, sizeof(float) * com_idx);
idx += com_idx;
} else {
memcpy(cpu_data + idx, com_buf, sizeof(float) * com_idx);
idx += com_idx;
memcpy(cpu_data + idx, max_buf, sizeof(float) * max_idx);
idx += max_idx;
}
}
free(min_buf);
free(max_buf);
free(com_buf);
}
}
//! clip the prior's coordinates so that they stay within [0, 1]
if (param->clip) {
for (int32_t d = 0; d < channel_size; ++d) {
cpu_data[d] = std::min(std::max(cpu_data[d], 0.f), 1.f);
}
}
//! set the variance.
float* ptr = cpu_data + channel_size;
int count = 0;
for (int32_t h = 0; h < height; ++h) {
for (int32_t w = 0; w < width; ++w) {
for (int32_t i = 0; i < param->prior_num; ++i) {
for (int j = 0; j < 4; ++j) {
ptr[count] = param->variances[j];
++count;
}
}
}
}
return cpu_data;
}
int PriorBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
// inputs
auto in_var_name = op_info->Input("Input").front();
auto in = scope->FindVar(in_var_name)->GetMutable<lite::Tensor>();
auto in_dims = in->dims();
auto img_var_name = op_info->Input("Image").front();
auto img = scope->FindVar(img_var_name)->GetMutable<lite::Tensor>();
auto img_dims = img->dims();
std::vector<int32_t> i_input_shape_data(in_dims.size());
for (size_t i = 0; i < in_dims.size(); i++) {
i_input_shape_data[i] = static_cast<int32_t>(in_dims[i]);
}
// outputs
auto boxes_var_name = op_info->Output("Boxes").front();
auto boxes = scope->FindVar(boxes_var_name)->GetMutable<lite::Tensor>();
auto var_var_name = op_info->Output("Variances").front();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
// param
st_priorbox_param param;
param.clip = op_info->GetAttr<bool>("clip");
param.min_sizes = op_info->GetAttr<std::vector<float>>("min_sizes");
param.max_sizes = op_info->GetAttr<std::vector<float>>("max_sizes");
param.aspect_ratios = op_info->GetAttr<std::vector<float>>("aspect_ratios");
param.variances = op_info->GetAttr<std::vector<float>>("variances");
param.offset = op_info->GetAttr<float>("offset");
if (op_info->HasAttr("flip")) {
param.flip = op_info->GetAttr<bool>("flip");
}
if (op_info->HasAttr("img_w")) {
param.img_w = op_info->GetAttr<int32_t>("img_w");
}
if (op_info->HasAttr("img_h")) {
param.img_h = op_info->GetAttr<int32_t>("img_h");
}
if (op_info->HasAttr("step_w")) {
param.step_w = op_info->GetAttr<float>("step_w");
}
if (op_info->HasAttr("step_h")) {
param.step_h = op_info->GetAttr<float>("step_h");
}
if (op_info->HasAttr("prior_num")) {
param.prior_num = op_info->GetAttr<int32_t>("prior_num");
}
if (op_info->HasAttr("min_max_aspect_ratios_order")) {
param.min_max_aspect_ratios_order =
op_info->GetAttr<bool>("min_max_aspect_ratios_order");
}
float* cpu_data = compute_priorbox_kernel(op, &param);
auto boxes_dims = boxes->dims();
std::vector<int32_t> i_pri_out_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_pri_out_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
i_pri_out_shape_data[0] *= 2;
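// The leading dim is doubled because the kernel emits boxes and variances
// into one buffer; the add_tf_split_layer below splits that buffer into the
// two output tensors.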
add_priorbox_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_input_shape_data[0]),
in_dims.size(),
static_cast<const char*>(in_var_name.c_str()),
const_cast<const int*>(&i_pri_out_shape_data[0]),
boxes_dims.size(),
static_cast<const char*>(unique_op_name.c_str()),
static_cast<const float*>(cpu_data),
param.min_sizes.size(),
const_cast<const float*>(&param.min_sizes[0]),
param.max_sizes.size(),
const_cast<const float*>(&param.max_sizes[0]),
param.aspect_ratios.size(),
const_cast<const float*>(&param.aspect_ratios[0]),
static_cast<int>(param.flip),
static_cast<int>(param.clip),
param.variances.size(),
const_cast<const float*>(&param.variances[0]),
param.img_h,
param.img_w,
param.step_h,
param.step_w,
param.offset);
std::vector<int32_t> i_output_shape_data(boxes_dims.size());
for (size_t i = 0; i < boxes_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(boxes_dims[i]);
}
int32_t* shape[2];
int dim[2];
const char* name[2];
dim[0] = boxes_dims.size();
dim[1] = boxes_dims.size();
name[0] = static_cast<const char*>(boxes_var_name.c_str());
name[1] = static_cast<const char*>(var_var_name.c_str());
shape[0] = &i_output_shape_data[0];
shape[1] = &i_output_shape_data[0];
int split_size = 2;
add_tf_split_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_pri_out_shape_data[0]),
boxes_dims.size(),
static_cast<const char*>(unique_op_name.c_str()),
2,
shape,
dim,
name,
boxes_dims.size(),
0,
&split_size,
0);
graph->AddNode(boxes_var_name);
graph->AddNode(var_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(prior_box,
kBM,
paddle::lite::subgraph::bm::PriorBoxConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
std::vector<int32_t> i_x_shape_data(x_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int32_t>(x_dims[i]);
}
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int32_t>(output_dims[i]);
}
// auto axis = op_info->GetAttr<int>("axis");
add_reshape_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size());
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(reshape,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(reshape2,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(flatten,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(flatten2,
kBM,
paddle::lite::subgraph::bm::ReshapeConverter);
......@@ -48,7 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < length; i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto axis = op_info->GetAttr<int>("axis");
int32_t axis = -1;
if (op_info->HasAttr("axis")) {
axis = op_info->GetAttr<int>("axis");
}
if (axis < 0) {
axis += x_dims.size();
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
auto axis = op_info->GetAttr<std::vector<int>>("axis");
CHECK_EQ(axis.size(), x_dims.size());
add_transpose_layer_v2(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
DTYPE_FP32,
static_cast<const char*>(output_var_name.c_str()),
NULL,
const_cast<const int*>(&axis[0]));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(transpose,
kBM,
paddle::lite::subgraph::bm::TransposeConverter);
REGISTER_SUBGRAPH_BRIDGE(transpose2,
kBM,
paddle::lite::subgraph::bm::TransposeConverter);
......@@ -54,7 +54,7 @@ int SubgraphEngine::BuildDeviceProgram() {
}
std::string net_name = "paddle_bitmain";
__bmcompile_opt(
-graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 2);
+graph.GetCompilerHandle(), const_cast<char*>(net_name.c_str()), 1);
void* bmodel_data = nullptr;
unsigned int data_size = 0;
bm_hd_ = static_cast<bm_handle_t>(ctx.GetHandle());
......@@ -109,7 +109,6 @@ int SubgraphEngine::BuildDeviceProgram() {
net_info_->output_dtypes[i],
stage.output_shapes[i]);
}
return status;
}
......
......@@ -34,27 +34,29 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto input_type = kernel->GetInputDeclType("Input");
CHECK(input_type->precision() == PRECISION(kFloat));
CHECK(input_type->layout() == DATALAYOUT(kNCHW));
-auto input = scope->FindMutableTensor(input_name);
+auto input = scope->FindTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w_type = kernel->GetInputDeclType("W");
CHECK(w_type->precision() == PRECISION(kFloat));
CHECK(w_type->layout() == DATALAYOUT(kNCHW));
-auto w = scope->FindMutableTensor(w_name);
+auto w = scope->FindTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[NPU] input dims: " << input_dims << " w dims: " << w_dims
<< " m: " << m << " k: " << k << " n: " << n;
// Create input node and reshape it to (m, k, 1, 1)
std::shared_ptr<Node> input_node = nullptr;
......@@ -76,7 +78,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
transpose_w.Resize({n, k, 1, 1});
transpose_w.set_persistable(true);
auto transpose_w_data = transpose_w.mutable_data<float>();
-auto w_data = w->mutable_data<float>();
+auto w_data = w->data<float>();
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
......@@ -85,10 +87,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto trans_w_node = graph->Add(w_name, transpose_w);
// FC node
-auto fc_node = graph->Add<ge::op::FullConnection>(out_name + "/fc");
+auto fc_node = graph->Add<ge::op::FullConnection>(out_name);
auto fc_op = fc_node->data<ge::op::FullConnection>();
fc_op->set_input_x(*reshaped_input_node->data());
fc_op->set_input_w(*trans_w_node->data());
// Add bias node if bias tensor exists
if (HasInputArg(op_info, scope, "Bias")) {
std::shared_ptr<Node> bias_node = nullptr;
......@@ -99,19 +102,23 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto bias_type = kernel->GetInputDeclType("Bias");
CHECK(bias_type->precision() == PRECISION(kFloat));
CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
-auto bias = scope->FindMutableTensor(bias_name);
+auto bias = scope->FindTensor(bias_name);
auto bias_dims = bias->dims();
CHECK_EQ(bias_dims.production(), n);
bias_node = graph->Add(bias_name, *bias, {1, n, 1, 1});
}
fc_op->set_input_b(*bias_node->data());
}
-// Reshape output of FC node from (m, n, 1, 1) to (m, n)
+// Reshape output of FC node from (m, n, 1, 1) to out_shape
auto reshaped_fc_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_fc_op = reshaped_fc_node->data<ge::op::Reshape>();
reshaped_fc_op->set_input_tensor(*fc_node->data());
-reshaped_fc_op->set_attr_shape({m, n});
+auto out_shape = out_dims.Vectorize();
+reshaped_fc_op->set_attr_shape(
+ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
reshaped_fc_op->set_attr_axis(0);
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
......@@ -35,14 +35,14 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->precision() == PRECISION(kFloat));
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
-auto x = scope->FindMutableTensor(x_name);
+auto x = scope->FindTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y_type = kernel->GetInputDeclType("Y");
CHECK(y_type->precision() == PRECISION(kFloat));
CHECK(y_type->layout() == DATALAYOUT(kNCHW));
-auto y = scope->FindMutableTensor(y_name);
+auto y = scope->FindTensor(y_name);
auto y_dims = y->dims();
if (x_dims.size() == 1 || x_dims.size() != y_dims.size()) {
......@@ -50,6 +50,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
<< "[NPU] dims size of x and y must be same and greater than 1.";
return FAILED;
}
if (y_dims.size() == 2 && !y->persistable()) {
LOG(WARNING) << "[NPU] y must be const if y is 2-D";
return FAILED;
}
if (x_dims.size() > 2 &&
x_dims.count(0, x_dims.size() - 2) !=
y_dims.count(0, y_dims.size() - 2)) {
......@@ -61,7 +65,7 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
-auto out = scope->FindMutableTensor(out_name);
+auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
bool transpose_x = op_info->GetAttr<bool>("transpose_X");
......@@ -80,7 +84,6 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
x_node = graph->Add(x_name, *x);
}
// Y node which only supports 2-D persistable tensor
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
......
......@@ -36,18 +36,27 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->precision() == PRECISION(kFloat));
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
-auto x = scope->FindMutableTensor(x_name);
+auto x = scope->FindTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y_type = kernel->GetInputDeclType("Y");
CHECK(y_type->precision() == PRECISION(kFloat));
CHECK(y_type->layout() == DATALAYOUT(kNCHW));
auto y = scope->FindMutableTensor(y_name);
auto y = scope->FindTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
if (out_dims.size() > 4) {
LOG(WARNING) << "[NPU] not supported above 4-D.";
return FAILED;
}
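// Flatten x into an (m, k) matrix and y into a (k, n) matrix according to
// x_num_col_dims / y_num_col_dims, as logged via m/n/k below.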
int x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
int y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
int m = x_dims.Slice(0, x_num_col_dims).production();
......@@ -58,20 +67,20 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k;
VLOG(3) << "x_name:" << x_name << ", is data: " << graph->Has(x_name);
VLOG(3) << "y_name:" << y_name << ", is data: " << graph->Has(y_name);
CHECK(graph->Has(x_name))
<< "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet.";
// X node which supports persistable and non-persistable tensor, and
// reshape to (m, k)
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
auto reshaped_x_node = graph->Add<ge::op::Reshape>(x_name + "/reshape");
auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>();
reshaped_x_op->set_input_tensor(*x_node->data());
reshaped_x_op->set_attr_shape({m, k});
reshaped_x_op->set_attr_axis(0);
x_node = reshaped_x_node;
if (x_dims.size() != 2) {
auto reshaped_x_node = graph->Add<ge::op::Reshape>(x_name + "/reshape");
auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>();
reshaped_x_op->set_input_tensor(*x_node->data());
reshaped_x_op->set_attr_shape({m, k});
reshaped_x_op->set_attr_axis(0);
x_node = reshaped_x_node;
}
} else {
x_node = graph->Add(x_name, *x, {m, k});
}
......@@ -81,12 +90,14 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
auto reshaped_y_node = graph->Add<ge::op::Reshape>(y_name + "/reshape");
auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>();
reshaped_y_op->set_input_tensor(*y_node->data());
reshaped_y_op->set_attr_shape({k, n});
reshaped_y_op->set_attr_axis(0);
y_node = reshaped_y_node;
if (y_dims.size() != 2) {
auto reshaped_y_node = graph->Add<ge::op::Reshape>(y_name + "/reshape");
auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>();
reshaped_y_op->set_input_tensor(*y_node->data());
reshaped_y_op->set_attr_shape({k, n});
reshaped_y_op->set_attr_axis(0);
y_node = reshaped_y_node;
}
} else {
y_node = graph->Add(y_name, *y, {k, n});
}
......@@ -96,6 +107,17 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto mul_op = mul_node->data<ge::op::MatMul>();
mul_op->set_input_x1(*x_node->data());
mul_op->set_input_x2(*y_node->data());
if (out_dims.size() != 2) {
auto reshaped_out_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_out_op = reshaped_out_node->data<ge::op::Reshape>();
reshaped_out_op->set_input_tensor(*mul_node->data());
auto out_shape = out_dims.Vectorize();
reshaped_out_op->set_attr_shape(
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
reshaped_out_op->set_attr_axis(0);
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/mul_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {
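// Naive O(M * N * K) reference implementation of mul, used to validate the
// output of the NPU bridge in test_mul() below.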
void mul_ref(const std::shared_ptr<operators::MulOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
int32_t x_num_col_dims = op_info->GetAttr<int32_t>("x_num_col_dims");
int32_t y_num_col_dims = op_info->GetAttr<int32_t>("y_num_col_dims");
auto x_data = x->mutable_data<float>();
auto y_data = y->mutable_data<float>();
auto out_data = out->mutable_data<float>();
auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims);
auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims);
CHECK_EQ(x_mat_dims[1], y_mat_dims[0]);
const int M = x_mat_dims[0];
const int K = x_mat_dims[1];
const int N = y_mat_dims[1];
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
out_data[m * N + n] = 0;
for (int k = 0; k < K; ++k) {
out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n];
}
}
}
}
void test_mul(const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
int x_num_col_dims,
int y_num_col_dims) {
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& supported_lists = bridges.AllFunctions();
CHECK(bridges.HasType("mul"));
Scope scope;
std::string x_var_name("X");
std::string y_var_name("Y");
std::string out_var_name("Out");
std::string out_ref_var_name("out_ref");
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize(x_shape);
y->Resize(y_shape);
FillTensor<float, int>(x);
FillTensor<float, int>(y);
// create mul op
cpp::OpDesc mul_op_desc;
mul_op_desc.SetType("mul");
mul_op_desc.SetInput("X", {x_var_name});
mul_op_desc.SetInput("Y", {y_var_name});
mul_op_desc.SetOutput("Out", {out_var_name});
mul_op_desc.SetAttr("x_num_col_dims", static_cast<int>(x_num_col_dims));
mul_op_desc.SetAttr("y_num_col_dims", static_cast<int>(y_num_col_dims));
auto mul_op = CreateOp<operators::MulOpLite>(mul_op_desc, &scope);
LauchOp(mul_op, {x_var_name}, {out_var_name});
out_ref->CopyDataFrom(*out);
mul_ref(mul_op);
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(NPUBridges, mul) {
test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2);
test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2);
test_mul({1, 4, 1, 1}, {4, 8}, 1, 1);
}
} // namespace bridges
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(mul);
USE_NPU_BRIDGE(mul);
......@@ -73,8 +73,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto shape =
std::vector<int>(actual_shape_data,
actual_shape_data + actual_shape_dims.production());
auto out_dims = lite::operators::ValidateShape(shape, x_dims);
auto out_shape = out_dims.Vectorize();
auto out_shape = lite::operators::ValidateShape(shape, x_dims);
if (out_shape.size() > 4) {
LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
"but Shape has "
......@@ -88,8 +87,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
reshape_op->set_input_w(*actual_shape_node->data());
} else {
auto shape = op_info->GetAttr<std::vector<int>>("shape");
auto out_dims = lite::operators::ValidateShape(shape, x_dims);
auto out_shape = out_dims.Vectorize();
auto out_shape = lite::operators::ValidateShape(shape, x_dims);
if (out_shape.size() > 4) {
LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
"but shape has "
......
......@@ -42,7 +42,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto axis = op_info->GetAttr<int>("axis");
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
if (axis < 0) {
axis += x_rank;
}
......
......@@ -37,7 +37,12 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto axis = op_info->GetAttr<std::vector<int>>("axis");
// X node
......
......@@ -14,12 +14,14 @@ add_kernel(fusion_elementwise_add_activation_opencl
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(sigmoid_opencl OPENCL basic SRCS sigmoid_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
......@@ -51,18 +53,18 @@ lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_sigmoid_opencl SRCS sigmoid_compute_test.cc
DEPS sigmoid_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_basic_opencl SRCS depthwise_conv2d_basic_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
lite_cc_test(test_depthwise_conv2d_image2d_opencl SRCS depthwise_conv2d_image2d_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc
# DEPS conv2d_1x1_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
......@@ -78,6 +80,15 @@ lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc
DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <>
void ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kImageDefault)>::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
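// Pick the specialized two-input kernel when exactly two tensors are
// concatenated; otherwise fall back to the generic multi-input kernel.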
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "image/concat_kernel.cl", build_options_);
// UpdateParams<kFloat, kImageDefault>();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
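// pre_size_ / post_size_ hold the element counts before and after the
// concat axis, computed from the first input's dims.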
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
// auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();  // output dims; used to size the output image
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.output->dims();  // used only for the dim logging below
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];  // reset per axis_ below
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int flag = 1;  // 1: concat axis maps to the image width (c or w); 0: image height (n or h)
switch (axis_) {
case 0:
width = x_dims[2]; // n
flag = 0;
break;
case 1:
width = x_dims[3]; // c
break;
case 2:
width = x_dims[0]; // h
flag = 0;
break;
case 3:
case -1:
width = x_dims[1]; // w
break;
default:
printf("this axis: %d does not support \n", axis_);
}
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status =
kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
auto* x_buf = inputs[i]->data<float, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_context()->GetCommandQueue().finish();
start += inputs[i]->dims()[axis_];
}
}
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
return "Concat using cl::Image, kFloat";
}
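// Buffer (kNCHW) variant below: same parameter bookkeeping as the image
// variant above, but the kernels operate on cl::Buffer memory.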
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
// UpdateParams<kFloat, kImageDefault>();
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
for (int i = 0; i < axis; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
if (in_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
}
}
}
}
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims();  // unused; retained to mirror the image variant
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto inputs = param.x;
int arg_idx = 0;
auto global_work_size = cl::NDRange{axis_size_};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
auto axis0 = inputs[0]->dims()[axis_];
int total0 = axis0 * post_size_;
int total1 = (axis_size_ - axis0) * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, axis_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total1);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
int size = inputs[i]->dims()[axis_];
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<int>(size));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, total0);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
start += size;
}
}
}
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
return "Concat using cl::Buffer, kFloat";
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
Concat_buffer;
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
DATALAYOUT(kImageDefault)>
Concat_image;
REGISTER_LITE_KERNEL(
concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kInt32),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
// REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
// .BindInput("X",
// {LiteType::GetTensorTy(TARGET(kOpenCL),
// PRECISION(kFloat),
// DATALAYOUT(kNCHW))})
// .BindInput("AxisTensor",
// {LiteType::GetTensorTy(TARGET(kOpenCL),
// PRECISION(kInt32),
// DATALAYOUT(kNCHW))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kOpenCL),
// PRECISION(kFloat),
// DATALAYOUT(kNCHW))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout>
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> {
public:
using param_t = operators::ConcatParam;
void PrepareForRun() override;
void Run() override;
std::string doc(); // override;
// protected:
// void UpdateParams();
int axis_size_ = 1;
int post_size_ = 1;
int pre_size_ = 1;
int axis_ = 1;
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
template <typename dtype>
void concat2_compute_ref(const dtype *in0,
const dtype *in1,
const int axis,
const DDim in0_dim,
const DDim in1_dim,
const DDim out_dim,
dtype *out_data) {
int pre_size = 1;
int post_size = 1;
for (int i = 0; i < axis; i++) {
pre_size *= in0_dim[i];
}
for (int i = axis + 1; i < in0_dim.size(); i++) {
post_size *= in0_dim[i];
}
int axis_size = out_dim[axis];
for (int i = 0; i < pre_size; i++) {
for (int j = 0; j < axis_size; j++) {
if (j < in0_dim[axis]) {
memcpy(out_data, in0, sizeof(dtype) * post_size);
in0 += post_size;
out_data += post_size;
}
}
}
}
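// Generic multi-input reference concat: for every outer slice it copies each
// input's chunk along the concat axis in turn.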
template <typename dtype>
void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
const std::vector<DDim> &ins_dim,
int axis,
const DDim out_dim,
dtype *out_data) {
int pre_size = 1;
int post_size = 1;
for (int i = 0; i < axis; i++) {
pre_size *= ins_dim[0][i];
}
for (int i = axis + 1; i < ins_dim[0].size(); i++) {
post_size *= ins_dim[0][i];
}
int axis_size = out_dim[axis];
for (int i = 0; i < pre_size; i++) {
for (int j = 0; j < ins_data.size(); j++) {
int size = post_size * ins_dim[j][axis];
memcpy(out_data, ins_data[j], sizeof(dtype) * size);
out_data += size;
}
}
}
#if 0 // concat_buffer
TEST(opencl_concat_buffer, compute) {
// prepare data
const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
lite::Tensor x0, x1, x2, out, out_ref;
x0.Resize(x0_dim);
x1.Resize(x1_dim);
x2.Resize(x2_dim);
out.Resize(out_dim);
out_ref.Resize(out_dim);
auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
auto *mapped_x0 = static_cast<float *>(
TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
auto *mapped_x1 = static_cast<float *>(
TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
auto *mapped_x2 = static_cast<float *>(
TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
for (int i = 0; i < x0_dim.production(); i++) {
mapped_x0[i] = dist(engine);
}
for (int i = 0; i < x1_dim.production(); i++) {
mapped_x1[i] = dist(engine);
}
for (int i = 0; i < x2_dim.production(); i++) {
mapped_x2[i] = dist(engine);
}
// set param and kernel, then run
operators::ConcatParam param;
std::vector<lite::Tensor *> ins;
ins.push_back(&x0);
ins.push_back(&x1);
ins.push_back(&x2);
auto axis = 1;
param.x = ins;
param.output = &out;
param.axis = axis;
std::vector<const float *> ins_data;
std::vector<DDim> ins_dim;
ins_data.push_back(mapped_x0);
ins_data.push_back(mapped_x1);
ins_data.push_back(mapped_x2);
ins_dim.push_back(x0_dim);
ins_dim.push_back(x1_dim);
ins_dim.push_back(x2_dim);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
std::unique_ptr<KernelContext> concat_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(concat_context->As<OpenCLContext>()));
kernel->SetContext(std::move(concat_context));
kernel->Launch();
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.output->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x0_data, mapped_x0);
TargetWrapperCL::Unmap(x1_data, mapped_x1);
TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
#endif // concat_buffer
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(concat_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
for (auto &axis : {0, 1, 2, 3}) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
const int axis = 1;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
<< " " << h << " " << w << " ========";
LOG(INFO) << "======== axis: " << axis;
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto buf_to_img_kernels1 =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto concat_img_kernels =
KernelRegistry::Global().Create("concat",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels1.empty());
ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(concat_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto concat_img_kernel = std::move(concat_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
lite::Tensor x0, x1, y, concat_in0, concat_in1, concat_out, y_ref;
operators::LayoutParam BufferToImageParam0, BufferToImageParam1;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam0.x = &x0;
BufferToImageParam0.y = &concat_in0;
BufferToImageParam1.x = &x1;
BufferToImageParam1.y = &concat_in1;
ImageToBufferParam.x = &concat_out;
ImageToBufferParam.y = &y;
std::vector<lite::Tensor *> ins;
operators::ConcatParam concatParam;
ins.push_back(&concat_in0);
ins.push_back(&concat_in1);
concatParam.x = ins;
concatParam.axis = axis;
concatParam.output = &concat_out;
const DDim x0_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim x1_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim out_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x1_dim[axis] += 2;
out_dim[axis] = x0_dim[axis] + x1_dim[axis];
x0.Resize(x0_dim);
x1.Resize(x1_dim);
y.Resize(out_dim);
concat_in0.Resize(x0_dim);
concat_in1.Resize(x1_dim);
concat_out.Resize(out_dim);
y_ref.Resize(out_dim);
auto concat_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(out_dim);
auto concat_image2d_shape_in0 =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x0_dim);
auto concat_image2d_shape_in1 =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x1_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data0 = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x_data1 = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x0 = static_cast<float *>(TargetWrapperCL::Map(
x_data0, 0, sizeof(float) * x0_dim.production()));
auto *mapped_x1 = static_cast<float *>(TargetWrapperCL::Map(
x_data1, 0, sizeof(float) * x1_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * out_dim.production()));
for (int i = 0; i < x0_dim.production(); ++i) {
mapped_x0[i] = static_cast<int>(i) - x0_dim.production() / 2;
}
for (int i = 0; i < x1_dim.production(); ++i) {
mapped_x1[i] = static_cast<int>(i) - x1_dim.production() / 2;
}
for (int i = 0; i < out_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0);
}
auto *concat_in_data0 = concat_in0.mutable_data<float, cl::Image2D>(
concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]);
auto *concat_in_data1 = concat_in1.mutable_data<float, cl::Image2D>(
concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]);
auto *concat_out_data = concat_out.mutable_data<float, cl::Image2D>(
concat_image2d_shape["width"], concat_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam0);
std::unique_ptr<KernelContext> buf_to_img_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
buf_to_img_kernel1->SetParam(BufferToImageParam1);
std::unique_ptr<KernelContext> buf_to_img_context1(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context1->As<OpenCLContext>()));
buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
concat_img_kernel->SetParam(concatParam);
std::unique_ptr<KernelContext> concat_img_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(concat_img_context->As<OpenCLContext>()));
concat_img_kernel->SetContext(std::move(concat_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
buf_to_img_kernel1->Launch();
LOG(INFO) << "run kernel: concat_img_kernel";
concat_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute cpu reference
std::vector<const float *> ins_ptr;
std::vector<DDim> in_dim;
ins_ptr.push_back(mapped_x0);
ins_ptr.push_back(mapped_x1);
in_dim.push_back(x0_dim);
in_dim.push_back(x1_dim);
concat_mul_compute_ref<float>(
ins_ptr, in_dim, axis, out_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << mapped_x0[eidx] << ", " << mapped_x1[eidx] << " -> "
<< mapped_y[eidx] << std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (std::abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x0_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0);
TargetWrapperCL::Unmap(x_data1, mapped_x1);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // axis
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
// concat buffer
// USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
// concat image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
#define USE_BUFFER_FOR_CONV1x1_BIAS
class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
}
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
if (has_bias) {
build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
}
auto& context = ctx_->As<OpenCLContext>();
if (param.x->dims()[1] % 4 == 0) {
context.cl_context()->AddKernel(kernel_func_name_simple_,
"image/conv2d_1x1_kernel.cl",
build_options_);
} else {
context.cl_context()->AddKernel(
kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
}
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = param.filter->data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
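// Offset handed to the kernel: filter_h / 2 - pad; for a 1x1 filter this
// reduces to -pad.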
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
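// The default image2d layout packs 4 channels into one RGBA pixel, so the
// image width equals input_w * ceil(input_c / 4); dividing by input_w
// recovers the number of channel blocks.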
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// handle bias use buffer for channel wise , use image for element wise
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
#ifndef USE_BUFFER_FOR_CONV1x1_BIAS
is_element_wise_bias
? (bias_image = param.bias->data<float, cl::Image2D>())
: (bias_buf = param.bias->data<float, cl::Buffer>());
#else
bias_image = param.bias->data<float, cl::Image2D>();
#endif
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
if (input_dims[1] % 4 == 0) {
kernel_key << kernel_func_name_simple_ << build_options_;
} else {
kernel_key << kernel_func_name_ << build_options_;
}
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
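// maptofactor(w, 4) rounds w up to the number of 4-wide groups
// (i.e. ceil(w / 4)), since the 1x1 kernel processes the width dimension
// in groups of four.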
int maped_w = maptofactor(w, 4);
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, maped_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef USE_BUFFER_FOR_CONV1x1_BIAS
if (is_element_wise_bias != 0) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
} else {
VLOG(4) << "set bias_buf: ";
status = kernel.setArg(++arg_idx, *bias_buf);
}
#else
status = kernel.setArg(++arg_idx, *bias_image);
#endif
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
private:
std::string kernel_func_name_{"conv2d_1x1"};
std::string kernel_func_name_simple_{"conv2d_1x1_simple"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(conv2d_1x1,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::Conv2d1x1Image2DCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/utils/logging.h"
namespace paddle {
namespace lite {
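// Naive direct convolution on the host; serves as the reference result for
// the OpenCL conv2d_1x1 kernel tested below.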
template <typename Dtype1, typename Dtype2>
static void conv_basic(const Dtype1* din,
Dtype2* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const Dtype1* weights,
const Dtype2* bias,
int group,
int kernel_w,
int kernel_h,
int stride_w,
int stride_h,
int dila_w,
int dila_h,
int pad_w,
int pad_h,
bool flag_bias,
bool flag_relu) {
Dtype2 beta = 0;
auto src_data = din;
auto dst_data_ref = dout;
auto weights_data = weights;
auto with_bias = flag_bias;
auto bias_data = bias;
int in_num = num;
int out_channels = chout;
int out_h = hout;
int out_w = wout;
int in_channel = chin;
int in_h = hin;
int in_w = win;
int out_c_group = out_channels / group;
int in_c_group = in_channel / group;
for (int n = 0; n < in_num; ++n) {
for (int g = 0; g < group; ++g) {
for (int oc = 0; oc < out_c_group; ++oc) {
for (int oh = 0; oh < out_h; ++oh) {
for (int ow = 0; ow < out_w; ++ow) {
int out_idx = n * group * out_c_group * out_h * out_w +
g * out_c_group * out_h * out_w + oc * out_h * out_w +
oh * out_w + ow;
Dtype2 bias_d =
with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0;
dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta;
for (int ic = 0; ic < in_c_group; ++ic) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int iw = ow * stride_w - pad_w + kw * (dila_w);
int ih = oh * stride_h - pad_h + kh * (dila_h);
if (iw < 0 || iw >= in_w) continue;
if (ih < 0 || ih >= in_h) continue;
int iidx = n * in_channel * in_h * in_w +
g * in_c_group * in_h * in_w + ic * in_h * in_w +
ih * in_w + iw;
int widx =
g * out_c_group * in_c_group * kernel_h * kernel_w +
oc * in_c_group * kernel_h * kernel_w +
ic * kernel_h * kernel_w + kh * kernel_w + kw;
dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx];
}
}
}
if (flag_relu) {
dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)0;
}
}
}
}
}
}
}
TEST(conv2d_1x1, compute) {
// conv infos
const int ksize = 1;
const int stride = 1;
const int pad = 0;
const int group = 1;
const int dilation = 0;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 4; oc < 10; oc += 1) { // oc
for (int ih = 4; ih < 9; ih += 1) { // ih
/*int iw = ih;*/ for (int iw = 4; iw < 10; iw += 1) { // iw
for (int ic = 4; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (bool relu_flag : {true, false}) {
#else
const int batch_size = 1;
const int oc = 4;
const int ih = 8;
const int iw = 8;
const int ic = 4;
const bool bias_flag = false;
const bool relu_flag = false;
#endif
const int oh = ih;
const int ow = iw;
VLOG(4) << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d_1x1",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
VLOG(4) << "created conv2d_1x1 kernel";
VLOG(4) << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
param.fuse_relu = relu_flag;
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_1x1_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(conv_1x1_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_1x1_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, ih, iw})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
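// Expected image2d extents: the default converter packs 4 channels per RGBA
// pixel (width = w * ceil(c / 4), height = n * h); the NW-block filter
// layout below spans ksize * ceil(oc / 4) by ic * ksize.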
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((oc + 3) / 4);
size_t filter_image_height = ic * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * ih * iw);
std::vector<float> bias_v(oc);
VLOG(4) << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
VLOG(4) << "after gen input and filter ...";
VLOG(4) << "input_v.size(): " << input_v.size();
VLOG(4) << "filter_v.size(): " << filter_v.size();
VLOG(4) << "output_v.size(): " << output_v.size();
VLOG(4) << "bias_v.size(): " << bias_v.size();
VLOG(4) << "input_dim.production(): " << input_dim.production();
VLOG(4) << "filter_dim.production(): "
<< filter_dim.production();
VLOG(4) << "out_dim.production(): " << out_dim.production();
VLOG(4) << "bias_dim.production(): " << bias_dim.production();
VLOG(4) << "4 * input_image_height * input_image_width: "
<< 4 * input_image_height * input_image_width;
VLOG(4) << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
VLOG(4) << "set mapped input ...";
std::vector<float> x_image_v(
input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(
out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
/* Debug dump of the NCHW input and its image2d layout:
for (int j = 0; j < input_v.size(); j += 1) {
std::cout << j << " " << input_v.data()[j] << std::endl;
}
std::cout << std::endl;
for (int j = 0; j < x_image_v.size(); j += 1) {
std::cout << j << " " << x_image_v.data()[j] << std::endl;
} */
VLOG(4) << "set mapped filter ...";
paddle::lite::CLImageConverterNWBlock nw_convertor;
nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
auto* filter_image2d = filter.mutable_data<float, cl::Image2D>(
filter_image_width,
filter_image_height,
filter_image_v.data());
if (bias_flag) {
nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
bias_v.data(), bias_image_v.data(), bias_dim);
auto* bias_data = bias.mutable_data<float, cl::Image2D>(
bias_image_width, bias_image_height, bias_image_v.data());
}
VLOG(4) << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
VLOG(4) << "prepare kernel ready";
VLOG(4) << "kernel launch ...";
kernel->Launch();
VLOG(4) << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
VLOG(4) << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
VLOG(4) << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
VLOG(4) << " conv_basic end ..... ";
VLOG(4) << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kImageDefault, image2d);
@@ -362,6 +362,58 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
#if 1 // TODO(ysh329): enable general dwconv
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]) {
#else // TODO(ysh329): remove dwconv3x3s1 and dwconv3x3 temporarily, need fix
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
kernel_h == 3 && kernel_w == 3 && groups > 1) {
// depth_conv2d_3x3s1, depth_conv2d_3x3
if (stride_h == 1 && dilations[0] == 1) {
kernel_func_names_.push_back("depth_conv2d_3x3s1");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
} else {
kernel_func_names_.push_back("depth_conv2d_3x3");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
}
kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
kernel_h != 3) {
#endif
// depth_conv2d
kernel_func_names_.push_back("depth_conv2d");
kernel_func_paths_.push_back("image/depthwise_conv2d_basic_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (kernel_h == 3 && kernel_w == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
@@ -393,6 +445,8 @@ void ConvImageCompute::PrepareForRun() {
} else {
LOG(FATAL) << "conv image compute not support this condition yet! ";
}
VLOG(1) << "kernel_func_names_[0]:" << kernel_func_names_[0]
<< " kernel_func_paths_[0]:" << kernel_func_paths_[0];
std::string build_options_single(" -DCL_DTYPE_float");
// relu options
@@ -582,6 +636,184 @@ void ConvImageCompute::Conv2d1x1() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d3x3() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
int filter_channel = filter_dims[1];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
// re-calc group
int new_groups{param.groups};
if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
new_groups = 1;
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
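// Illustrative cases for the re-grouping above (example shapes, not from
// the original source):
//   standard conv : filter {oc, ic, kh, kw}        -> new_groups = 1
//   depthwise     : filter {ic, 1, kh, kw}         -> keeps param.groups
//   grouped conv  : ic = 32, filter {32, 16, 3, 3} -> new_groups = 32 / 16 = 2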
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
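// Descriptive note: the 3D work size maps one work-item per output texel,
// with dim 0 walking the ceil(oc / 4) RGBA channel blocks, dim 1 the output
// width, and dim 2 the folded batch * output-height rows.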
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
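// The cache key is the kernel function name plus its build options, so the
// same source compiled with different macros (e.g. " -DRELU") is fetched as
// a distinct kernel.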
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
@@ -726,6 +958,7 @@ void ConvImageCompute::Conv2d5x5() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
@@ -871,6 +1104,326 @@ void ConvImageCompute::Conv2d7x7() {
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::DepthwiseConv2d3x3s1() {
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<float, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<float, cl::Image2D>();
const cl::Image2D* bias_img = nullptr;
if (param.bias) {
bias_img = bias_gpu_image_.data<float, cl::Image2D>();
}
auto image_shape = InitImageDimInfoWith(output_dims);
auto* output_img = param.output->mutable_data<float, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
auto global_work_size = cl::NDRange(c_block, w_blk, nh);
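// Work-size arithmetic, e.g. for output_dims = {1, 32, 112, 112}:
//   c_block = (32 + 3) / 4      = 8   (four channels per RGBA texel)
//   w_blk   = (112 + 2 - 1) / 2 = 56  (each work-item covers two columns)
//   nh      = 1 * 112           = 112 (batch and height folded together)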
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *output_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_);
}
void ConvImageCompute::DepthwiseConv2d3x3() {
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
int offset = filter_dims[2] / 2 - paddings[0];
int input_c_block = (x_dims[1] + 3) / 4;
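// e.g. a 3x3 filter with pad = 1 gives offset = 3 / 2 - 1 = 0 (window
// centered on the output pixel), and x_dims[1] = 32 gives
// input_c_block = (32 + 3) / 4 = 8.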
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<float, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<float, cl::Image2D>();
const cl::Image2D* bias_img = nullptr;
if (param.bias) {
bias_img = bias_gpu_image_.data<float, cl::Image2D>();
}
auto image_shape = InitImageDimInfoWith(output_dims);
auto* output_img = param.output->mutable_data<float, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
VLOG(4) << "nh = " << nh;
VLOG(4) << "strides = " << strides[0];
VLOG(4) << "offset = " << offset;
VLOG(4) << "dilations = " << dilations[0];
VLOG(4) << "input_c_block = " << input_c_block;
VLOG(4) << "x_dims[3] = " << x_dims[3];
VLOG(4) << "x_dims[2] = " << x_dims[2];
VLOG(4) << "output_dims[3] = " << output_dims[3];
VLOG(4) << "output_dims[2] = " << output_dims[2];
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *output_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(offset));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(input_c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_);
}
void ConvImageCompute::DepthwiseConv2d() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// Handle bias: use a buffer for channel-wise bias and an image for element-wise bias.
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
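// Descriptive note: this setArg sequence must match the parameter list of
// the "depth_conv2d" kernel in image/depthwise_conv2d_basic_kernel.cl
// one-to-one; a wrong order can pass CL_CHECK_FATAL yet silently compute
// garbage when the argument types happen to agree.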
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace opencl
@@ -878,19 +1431,37 @@ void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(conv2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::ConvCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(conv2d,
kOpenCL,
kFloat,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ConvImageCompute,
                     image2d)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Output",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
kOpenCL,
kFloat,
kImageDefault,
......
@@ -71,8 +71,12 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
......
@@ -166,6 +166,8 @@ void PrintData(std::string name,
}
}
// buffer
#if 0
// #define PRINT_RESULT
#define LOOP_TEST
TEST(conv2d, compute_conv2d_1x1) {
@@ -623,8 +625,9 @@ TEST(conv2d, compute_conv2d_gemm) {
} // batch_size
#endif
}
#endif
} // namespace lite
} // namespace paddle
// USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);
@@ -446,6 +446,373 @@ TEST(conv2d, compute_image2d_1x1) {
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_3x3) {
// conv infos
const int ksize = 3;
// int loop_cnt = 0;
#ifdef LOOP_TEST
const int pad = 1;
const int dilation = 1;
const int stride = 2;
const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
#else
const int pad = 1;
const int dilation = 1;
#if 0 // small scale with group, but result of cpu reference is wrong
const int stride = 2;
const int group = 2;
const int batch_size = 1;
const int ic = 1;
const int ih = 3;
const int iw = 3;
const int oc = 2;
#else // big scale with group
const int stride = 1;
const int group = 32;
const int batch_size = 1;
const int ic = 32;
const int ih = 112;
const int iw = 112;
const int oc = 32;
#endif
const bool bias_flag = false;
const std::string relu_flag = "relu";
#endif
int filter_channel = ic;
if (group > 1) {
filter_channel = 1;
}
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
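// Assuming ConvOutputSize implements the usual direct-convolution shape
// rule out = (in + 2 * pad - (dilation * (ksize - 1) + 1)) / stride + 1,
// the big-scale defaults (ih = 112, ksize = 3, pad = 1, stride = 1) give
// oh = (112 + 2 - 3) / 1 + 1 = 112.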
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only support batch_size == 1";
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
param.groups = group;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
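// Assumed convention (inferred from the setup above): paddings carries four
// values {top, bottom, left, right}, while strides and dilations are {h, w}
// pairs.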
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_3x3_context(
    new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
    &(conv_3x3_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_3x3_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim = lite::DDim{
std::vector<int64_t>({oc, filter_channel, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
VLOG(2) << "input_dim:" << input_dim
<< " filter_dim:" << filter_dim << " out_dim:" << out_dim
<< " bias_flag:" << bias_flag << " bias_dim:" << bias_dim
<< " group:" << group << " stride:" << stride
<< " pad:" << pad << " dilation:" << dilation;
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((filter_channel + 3) / 4);
size_t filter_image_height = oc * ksize;
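// Worked example with the big-scale defaults above (ic = oc = 32,
// ih = iw = 112, ksize = 3, filter_channel = 1 since group > 1):
//   input_image_width   = 112 * ((32 + 3) / 4) = 896
//   input_image_height  = 112 * 1              = 112
//   filter_image_width  = 3 * ((1 + 3) / 4)    = 3
//   filter_image_height = 32 * 3               = 96
// Each texel packs four NCHW channels into its RGBA lanes.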
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * filter_channel * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (int i = 0; i < input_v.size(); ++i) {
input_v[i] = i; // gen(engine);
}
for (int i = 0; i < filter_v.size(); ++i) {
filter_v[i] = 1; // gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "input_image_height:" << input_image_height
<< " input_image_width:" << input_image_width;
SHADOW_LOG << "filter_image_height:" << filter_image_height
<< " filter_image_width:" << filter_image_width;
SHADOW_LOG << "4 * input_image_height *input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// filter kernel
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
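// conv_basic is the naive CPU direct convolution used as ground truth; the
// OpenCL result is compared against it element-wise below with an absolute
// tolerance of 1e-2.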
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
#ifdef PRINT_RESULT
for (int i = 0; i < out_dim.production(); i++) {
VLOG(4) << "output_v[" << i << "]:" << output_v[i]
<< " out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_5x5) {
@@ -537,6 +904,12 @@ TEST(conv2d, compute_image2d_5x5) {
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
VLOG(2) << "input_dim:" << input_dim
<< " filter_dim:" << filter_dim << " out_dim:" << out_dim
<< " bias_flag:" << bias_flag << " bias_dim:" << bias_dim
<< " group:" << group << " stride:" << stride
<< " pad:" << pad << " dilation:" << dilation;
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
......
@@ -123,420 +123,6 @@ class DepthwiseConv2dCompute
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class DepthwiseConv2dComputeFP16Image
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
std::string doc() const override {
return "DepthwiseConv2d using cl::Image2D/kImageDefault, kFP16";
}
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
int offset = filter_dims[2] / 2 - paddings[0];
int input_c_block = (x_dims[1] + 3) / 4;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<int16_t, cl::Image2D>();
auto* filter_img = param.filter->data<int16_t, cl::Image2D>();
auto* bias_img = param.bias == nullptr
? static_cast<cl::Image2D*>(nullptr)
: param.bias->data<int16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(output_dims);
auto* output_img = param.output->mutable_data<int16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
VLOG(4) << "nh = " << nh;
VLOG(4) << "strides = " << strides[0];
VLOG(4) << "offset = " << offset;
VLOG(4) << "dilations = " << dilations[0];
VLOG(4) << "input_c_block = " << input_c_block;
VLOG(4) << "x_dims[3] = " << x_dims[3];
VLOG(4) << "x_dims[2] = " << x_dims[2];
VLOG(4) << "output_dims[3] = " << output_dims[3];
VLOG(4) << "output_dims[2] = " << output_dims[2];
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *output_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(offset));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(input_c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_);
}
private:
std::string kernel_func_name_{"depth_conv2d_3x3"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class DepthwiseConv2d3x3s1ComputeFP16Image
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
std::string doc() const override {
return "DepthwiseConv2d3x3s1 using cl::Image2D/kImageDefault, kFP16";
}
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<int16_t, cl::Image2D>();
auto* filter_img = param.filter->data<int16_t, cl::Image2D>();
auto* bias_img = param.bias == nullptr
? static_cast<cl::Image2D*>(nullptr)
: param.bias->data<int16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(output_dims);
auto* output_img = param.output->mutable_data<int16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
auto global_work_size = cl::NDRange(c_block, w_blk, nh);
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *output_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_);
}
private:
std::string kernel_func_name_{"depth_conv2d_3x3s1"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class DepthwiseConv2dBasicComputeFP32Image
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
std::string doc() const override {
return "DepthwiseConv2d basic using cl::Image2D/kImageDefault, kFloat32";
}
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
if (has_bias) {
build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
}
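// Example of a fully assembled option string (illustrative):
//   "-DCL_DTYPE_float -DRELU -DBIASE_CH"
// i.e. the kernel variant (dtype, fused activation, bias mode) is chosen at
// OpenCL build time instead of branching at run time.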
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/depthwise_conv2d_basic_kernel.cl",
build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = param.filter->data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// Handle bias: use a buffer for channel-wise bias and an image for element-wise bias.
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = param.bias->data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
private:
std::string kernel_func_name_{"depth_conv2d"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
@@ -553,52 +139,3 @@ REGISTER_LITE_KERNEL(depthwise_conv2d,
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
REGISTER_LITE_KERNEL(
depthwise_conv2d,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::DepthwiseConv2dComputeFP16Image,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Bias",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageNW))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(
depthwise_conv2d_basic,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::DepthwiseConv2dBasicComputeFP32Image,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Filter",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
@@ -177,135 +177,7 @@ TEST(depthwise_conv2d_buffer_fp32, compute) {
TargetWrapperCL::Unmap(input_data, mapped_input);
}
TEST(depthwise_conv2d_image2d_fp16, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel";
lite::Tensor input, filter, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
std::vector<int> paddings = {0, 0};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.strides = std::vector<int>{1, 1};
std::vector<int> dilations = {1, 1};
param.dilations = std::make_shared<std::vector<int>>(dilations);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> dep_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(dep_context->As<OpenCLContext>()));
kernel->SetContext(std::move(dep_context));
LOG(INFO) << "kernel ready";
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(1 * 32 * 112 * 112);
std::vector<float> filter_v(32 * 1 * 3 * 3);
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
LOG(INFO) << "prepare input";
input.Resize({1, 32, 112, 112});
CLImageConverterDefault* default_converter = new CLImageConverterDefault();
DDim input_image_shape =
default_converter->InitImageDimInfoWith(input.dims());
LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
<< input_image_shape[1];
std::vector<float> input_image_data(input_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(
input_v.data(), input_image_data.data(), input.dims());
auto* input_image = input.mutable_data<int16_t, cl::Image2D>(
input_image_shape[0], input_image_shape[1], input_image_data.data());
LOG(INFO) << "prepare kernel";
filter.Resize({32, 1, 3, 3});
CLImageConverterNWBlock* nw_converter = new CLImageConverterNWBlock();
DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
<< filter_image_shape[1];
std::vector<float> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA
nw_converter->NCHWToImage(
filter_v.data(), filter_image_data.data(), filter.dims());
auto* filter_image = filter.mutable_data<int16_t, cl::Image2D>(
filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
LOG(INFO) << "launch";
output.Resize({1, 32, 110, 110});
DDim output_image_shape =
default_converter->InitImageDimInfoWith(output.dims());
LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
<< output_image_shape[1];
auto* output_image = output.mutable_data<int16_t, cl::Image2D>(
output_image_shape[0], output_image_shape[1]);
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<int16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
LOG(INFO) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
LOG(INFO) << "Could not find the sync event for the target cl tensor.";
}
lite::Tensor output_ref;
output_ref.Resize({1, 32, 110, 110});
auto* output_ref_data = output_ref.mutable_data<float>(TARGET(kARM));
depth_conv<float, 1, 1>(input_v.data(),
input.dims(),
filter_v.data(),
filter.dims(),
output_ref_data,
output_ref.dims());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
float* output_image_data = new float[output_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(output_image_data,
output_image,
output_image_shape[0],
output_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* output_data = new float[output_image_shape.production() * 4];
default_converter->ImageToNCHW(
output_image_data, output_data, output_image_shape, output.dims());
LOG(INFO) << "output_data vs output_ref_data";
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4);
LOG(INFO) << output_data[i] << " " << output_ref_data[i];
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFP16, kImageDefault, image2d);
@@ -142,7 +142,7 @@ TEST(depthwise_conv2d_basic, compute) {
VLOG(4) << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("depthwise_conv2d_basic",
KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
@@ -383,7 +383,133 @@ TEST(depthwise_conv2d_basic, compute) {
#endif
}
TEST(depthwise_conv2d_image2d_fp16, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel";
lite::Tensor input, filter, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
std::vector<int> paddings = {0, 0};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.strides = std::vector<int>{1, 1};
std::vector<int> dilations = {1, 1};
param.dilations = std::make_shared<std::vector<int>>(dilations);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> dep_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(dep_context->As<OpenCLContext>()));
kernel->SetContext(std::move(dep_context));
LOG(INFO) << "kernel ready";
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(1 * 32 * 112 * 112);
std::vector<float> filter_v(32 * 1 * 3 * 3);
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
LOG(INFO) << "prepare input";
input.Resize({1, 32, 112, 112});
CLImageConverterDefault* default_converter = new CLImageConverterDefault();
DDim input_image_shape =
default_converter->InitImageDimInfoWith(input.dims());
LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
<< input_image_shape[1];
std::vector<float> input_image_data(input_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(
input_v.data(), input_image_data.data(), input.dims());
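// CLImageConverterDefault lays NCHW out as a 2D image with
// width = W * ceil(C / 4) and height = N * H, so this {1, 32, 112, 112}
// input becomes an 896 x 112 image and input_image_data holds
// 896 * 112 * 4 floats.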
auto* input_image = input.mutable_data<int16_t, cl::Image2D>(
input_image_shape[0], input_image_shape[1], input_image_data.data());
LOG(INFO) << "prepare kernel";
filter.Resize({32, 1, 3, 3});
CLImageConverterNWBlock* nw_converter = new CLImageConverterNWBlock();
DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
<< filter_image_shape[1];
std::vector<float> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA
nw_converter->NCHWToImage(
filter_v.data(), filter_image_data.data(), filter.dims());
auto* filter_image = filter.mutable_data<int16_t, cl::Image2D>(
filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
LOG(INFO) << "launch";
output.Resize({1, 32, 110, 110});
DDim output_image_shape =
default_converter->InitImageDimInfoWith(output.dims());
LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
<< output_image_shape[1];
auto* output_image = output.mutable_data<int16_t, cl::Image2D>(
output_image_shape[0], output_image_shape[1]);
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<int16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
LOG(INFO) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
LOG(INFO) << "Could not find the sync event for the target cl tensor.";
}
lite::Tensor output_ref;
output_ref.Resize({1, 32, 110, 110});
auto* output_ref_data = output_ref.mutable_data<float>(TARGET(kARM));
depth_conv<float, 1, 1>(input_v.data(),
input.dims(),
filter_v.data(),
filter.dims(),
output_ref_data,
output_ref.dims());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
float* output_image_data = new float[output_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(output_image_data,
output_image,
output_image_shape[0],
output_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* output_data = new float[output_image_shape.production() * 4];
default_converter->ImageToNCHW(
output_image_data, output_data, output_image_shape, output.dims());
LOG(INFO) << "output_data vs output_ref_data";
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4);
LOG(INFO) << output_data[i] << " " << output_ref_data[i];
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(
depthwise_conv2d_basic, kOpenCL, kFloat, kImageDefault, image2d);
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, image2d);
@@ -66,6 +66,8 @@ void PrintData(std::string name, float* a, const int rows, const int cols) {
}
}
// buffer
#if 0 // fc_buffer
// #define PRINT_RESULT
#define LOOP_TEST
TEST(fc, compute) {
@@ -193,8 +195,9 @@ TEST(fc, compute) {
} // m
#endif
}
#endif // fc_buffer
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def);
// USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def);
@@ -185,7 +185,7 @@ class LayoutComputeImageDefaultToBufferChw
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
status = kernel.setArg(++arg_idx, static_cast<const int>(size_block));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
CL_CHECK_FATAL(status);
@@ -34,10 +34,10 @@ TEST(layout_ImageDefault, compute) {
for (int h = 1; h <= 100; h += 13) {
for (int w = 1; w <= 100; w += 17) {
#else
const int n = 1;
const int c = 1;
const int h = 1;
const int w = 100;
const int n = 2;
const int c = 9;
const int h = 20;
const int w = 5;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
@@ -86,8 +86,7 @@ TEST(layout_ImageDefault, compute) {
auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i);
mapped_y[i] = static_cast<int>(0);
mapped_x[i] = static_cast<float>(i);
}
// set context and kernel args
@@ -116,7 +115,7 @@ TEST(layout_ImageDefault, compute) {
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----";
for (int eidx = 0; i < x_dim.production(); ++eidx) {
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
@@ -251,7 +250,7 @@ TEST(layout_ImageNW, compute) {
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----";
for (int eidx = 0; i < x_dim.production(); ++eidx) {
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
@@ -229,15 +229,15 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(pool2d,
kOpenCL,
kFloat,
kNCHW,
paddle::lite::kernels::opencl::PoolCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
// REGISTER_LITE_KERNEL(pool2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::PoolCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(pool2d,
kOpenCL,
@@ -73,6 +73,8 @@ void pool_avg(const int padding_height,
}
}
// buffer
#if 0 // pool_buffer
TEST(pool2d_buffer_fp32, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
@@ -141,6 +143,7 @@ TEST(pool2d_buffer_fp32, compute) {
}
TargetWrapperCL::Unmap(out_data, mapped_out);
}
#endif // pool_buffer
TEST(pool2d_image2d_fp32, compute) {
LOG(INFO) << "to get kernel ...";
@@ -239,5 +242,5 @@ TEST(pool2d_image2d_fp32, compute) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def);
// USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ScaleParam;
std::string doc() const override { return "Scale using cl::Image2D, kFloat"; }
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/scale_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<float, cl::Image2D>();
const float scale = param.scale;
const float bias = param.bias;
LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
static_cast<cl::size_type>(out_image_shape["height"])};
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, bias);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
private:
std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(scale,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ScaleComputeImage2D,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
void scale(const float* input_data,
const DDim& in_dim,
float* output_data,
const float scale,
const float bias) {
for (int i = 0; i < in_dim.production(); i++) {
output_data[i] = input_data[i] * scale + bias;
}
}
TEST(scale_image2d_fp32, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"scale", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor x, out;
operators::ScaleParam param;
param.x = &x;
param.output = &out;
param.scale = 1.5f;
param.bias = 0.3f;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> scale_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(scale_context->As<OpenCLContext>()));
kernel->SetContext(std::move(scale_context));
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 11, 107, 107});
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 11, 107, 107});
x.Resize(in_dim);
out.Resize(out_dim);
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
std::vector<float> input_v(4 * 11 * 107 * 107);
for (auto& i : input_v) {
i = dist(engine);
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter = new CLImageConverterDefault();
DDim image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "image_shape = " << image_shape[0] << " " << image_shape[1];
std::vector<float> x_image_data(image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<float, cl::Image2D>(
image_shape[0], image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image;
auto* out_image =
out.mutable_data<float, cl::Image2D>(image_shape[0], image_shape[1]);
LOG(INFO) << "out_image:" << out_image;
kernel->Launch();
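// Launch() only enqueues the OpenCL kernel. ScaleComputeImage2D::Run() stores
// a cl::Event keyed by the output image in cl_wait_list, so the test looks up
// that event and waits on it before copying the result back to the host.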
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
scale(input_v.data(), in_dim, out_ref.get(), 1.5f, 0.3f);
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
float* out_image_data = new float[image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
image_shape[0],
image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[image_shape.production() * 4];
default_converter->ImageToNCHW(
out_image_data, out_data, image_shape, out_dim);
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(out_data[i], out_ref[i], 1e-6);
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(scale, kOpenCL, kFloat, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class SigmoidCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Buffer, kFloat";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
size_t count = x_dims.production();
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)count);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{count};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class SigmoidComputeFloatImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFloat";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<float, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class SigmoidComputeFP16ImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf =
param.X->data<int16_t,
cl::Image2D>(); // use int16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t
// represents half float
image_shape["width"],
image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_half -DSIGMOID"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(sigmoid,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::SigmoidCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFloatImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFP16ImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <math.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
const DDim &x_dim,
dtype *out_data) {
for (int i = 0; i < x_dim.production(); ++i) {
out_data[i] = 1 / (1 + expf(-x_data[i]));
}
}
// buffer
#if 0 // sigmoid_buffer
TEST(opencl_sigmoid_buffer, compute) {
// prepare data
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
lite::Tensor x, out;
x.Resize(x_dim);
out.Resize(x_dim);
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
auto *mapped_x = static_cast<float *>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
// set param and kernel, then run
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(sigmoid_context->As<OpenCLContext>()));
kernel->SetContext(std::move(sigmoid_context));
kernel->Launch();
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.Out->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif // sigmoid_buffer
#define LOOP_TEST
// #define PRINT_RESULT
TEST(sigmoid_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 9; n += 3) {
for (auto c : {1, 3, 9}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 3;
const int c = 9;
const int h = 51;
const int w = 11;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto sigmoid_img_kernels =
KernelRegistry::Global().Create("sigmoid",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(sigmoid_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> sigmoid_in
// sigmoid(img): sigmoid_in -> sigmoid_out
// layout(img->buf): sigmoid_out -> y
lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &sigmoid_in;
ImageToBufferParam.x = &sigmoid_out;
ImageToBufferParam.y = &y;
operators::ActivationParam SigmoidParam;
SigmoidParam.X = &sigmoid_in;
SigmoidParam.Out = &sigmoid_out;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
sigmoid_in.Resize(x_dim);
sigmoid_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto sigmoid_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<float>(dist(engine));
}
auto *sigmoid_in_data = sigmoid_in.mutable_data<float, cl::Image2D>(
sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
auto *sigmoid_out_data = sigmoid_out.mutable_data<float, cl::Image2D>(
sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
sigmoid_img_kernel->SetParam(SigmoidParam);
std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(sigmoid_img_context->As<OpenCLContext>()));
sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel";
sigmoid_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx] << ", mapped_x["
<< eidx << "]: " << mapped_x[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
#define SIGMOID_FP16_LOOP_TEST
// #define SIGMOID_FP16_PRINT_RESULT
TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
"layout(img2buf) "
"-> host";
#ifdef SIGMOID_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // SIGMOID_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto sigmoid_img_kernels =
KernelRegistry::Global().Create("sigmoid",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(sigmoid_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> sigmoid_in
// sigmoid(img): sigmoid_in -> sigmoid_out
// layout(img->buf): sigmoid_out -> y
lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &sigmoid_in;
ImageToBufferParam.x = &sigmoid_out;
ImageToBufferParam.y = &y;
operators::ActivationParam SigmoidParam;
SigmoidParam.X = &sigmoid_in;
SigmoidParam.Out = &sigmoid_out;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
sigmoid_in.Resize(x_dim);
sigmoid_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto sigmoid_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<float>(dist(engine));
}
auto *sigmoid_in_data = sigmoid_in.mutable_data<int16_t, cl::Image2D>(
sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
auto *sigmoid_out_data =
sigmoid_out.mutable_data<int16_t, cl::Image2D>(
sigmoid_image2d_shape["width"],
sigmoid_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
sigmoid_img_kernel->SetParam(SigmoidParam);
std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(sigmoid_img_context->As<OpenCLContext>()));
sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: sigmoid_img_kernel";
sigmoid_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef SIGMOID_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // SIGMOID_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]: " << mapped_y[eidx] << ", mapped_x["
<< eidx << "]: " << mapped_x[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef SIGMOID_FP16_LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
// sigmoid buffer
// USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);
// sigmoid image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kImageDefault, ImageDefault);
// sigmoid image2d fp16
USE_LITE_KERNEL(sigmoid, kOpenCL, kFP16, kImageDefault, ImageDefault);
@@ -222,14 +222,6 @@ class GeluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
};
// softsign(x) = x / (1 + |x|)
template <typename T>
struct SoftsignFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) {
out.device(d) = x / (static_cast<T>(1) + x.abs());
}
};
template <typename T>
class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
@@ -238,9 +230,13 @@ class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
void Run() override {
// auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
Activate<SoftsignFunctor<T>>(param.X, param.Out);
const T* x_data = param.X->data<T>();
T* out_data = param.Out->mutable_data<T>();
size_t x_size = param.X->numel();
for (size_t i = 0; i < x_size; i++) {
out_data[i] = x_data[i] / (static_cast<T>(1) + std::abs(x_data[i]));
}
}
virtual ~SoftsignCompute() = default;
@@ -48,6 +48,10 @@ inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context,
row_shuffle(context, src, index_lod, dst, indexed_src);
}
static inline int64_t CalculateSeqWidth(const DDim& dims) {
return dims.count(1, dims.size());
}
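// e.g. for a batch_gate tensor of dims [12, 192] this returns 192: the
// product of all dims except dim 0, i.e. the per-step row width used for
// the pointer offsets below.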
template <typename T>
class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
@@ -65,15 +69,16 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* bias = param.bias;
auto* batch_gate = param.batch_gate;
batch_gate->mutable_data<T>();
auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev;
batch_reset_hidden_prev->mutable_data<T>();
auto* batch_hidden = param.batch_hidden;
batch_hidden->mutable_data<T>();
T* batch_gate_ptr = batch_gate->mutable_data<T>();
T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data<T>();
T* batch_hidden_ptr = batch_hidden->mutable_data<T>();
auto* hidden = param.hidden;
hidden->mutable_data<T>();
auto hidden_dims = hidden->dims();
const auto& hidden_dims = hidden->dims();
lite::x86::math::LoDTensor2BatchFunctor<TARGET(kX86), T> to_batch;
to_batch(context, *input, batch_gate, true, is_reverse);
@@ -90,19 +95,23 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
std::vector<size_t> order(batch_gate->lod()[2]);
if (h0) {
// Since batch computing for GRU reorders the input sequences by length,
// the initial cell state also needs to be reordered accordingly.
const std::vector<size_t>& order(batch_gate->lod()[2]);
ReorderInitState<T>(context, *h0, order, &ordered_h0, true);
gru_value.prev_out_value = ordered_h0.mutable_data<T>();
} else {
gru_value.prev_out_value = nullptr;
}
auto batch_starts = batch_gate->lod()[0];
const auto& batch_starts = batch_gate->lod()[0];
size_t seq_len = batch_starts.size() - 1;
int64_t batch_gate_width = CalculateSeqWidth(batch_gate->dims());
int64_t batch_reset_hidden_prev_width =
CalculateSeqWidth(batch_reset_hidden_prev->dims());
int64_t batch_hidden_width = CalculateSeqWidth(batch_hidden->dims());
auto active_node =
lite::x86::math::detail::GetActivationType(param.activation);
auto active_gate =
@@ -145,13 +154,10 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int64_t bend = static_cast<int64_t>(batch_starts[n + 1]);
int64_t cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice<T>(bstart, bend);
Tensor hidden_t = batch_hidden->Slice<T>(bstart, bend);
gru_value.output_value = hidden_t.mutable_data<T>();
gru_value.gate_value = gate_t.mutable_data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.mutable_data<T>();
gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width;
gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width;
gru_value.reset_output_value = batch_reset_hidden_prev_ptr +
bstart * batch_reset_hidden_prev_width;
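// Row `bstart` of a row-major [num_rows, width] buffer begins at
// ptr + bstart * width, so this pointer arithmetic is equivalent to the
// removed Slice(bstart, bend) calls without creating temporary Tensors.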
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(CblasNoTrans,
@@ -188,13 +194,10 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int64_t bend = static_cast<int64_t>(batch_starts[n + 1]);
int64_t cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice<T>(bstart, bend);
Tensor hidden_t = batch_hidden->Slice<T>(bstart, bend);
gru_value.output_value = hidden_t.mutable_data<T>();
gru_value.gate_value = gate_t.mutable_data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.mutable_data<T>();
gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width;
gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width;
gru_value.reset_output_value = batch_reset_hidden_prev_ptr +
bstart * batch_reset_hidden_prev_width;
lite::x86::math::GRUUnitFunctor<TARGET(kX86), T>::compute(
context,
@@ -63,7 +63,19 @@ void ReduceFunctor(const lite::Tensor& input,
auto out = EigenScalar<T>::From(output);
functor(&x, &out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - R_D)>::From(*output, output->dims());
std::vector<DDim::value_type> out_dims;
if (keep_dim) {
// Construct the squeezed dims.
const int kDelFlag = -2;
out_dims = output->dims().Vectorize();
for (size_t i = 0; i < dims.size(); ++i) {
out_dims[reduce_dim[i]] = kDelFlag;
}
out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
out_dims.end());
}
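// e.g. output->dims() == [2, 1, 4] with reduce_dim == {1}: index 1 is
// flagged with kDelFlag and erased, giving out_dims == [2, 4], which
// matches the EigenTensor rank (D - R_D) below.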
auto out = EigenTensor<T, (D - R_D)>::From(
*output, keep_dim ? DDim(out_dims) : output->dims());
functor(&x, &out, reduce_dim);
}
}
@@ -55,24 +55,33 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& context = ctx_->As<X86Context>();
CHECK(param.output);
CHECK(param.x);
param.output->mutable_data<T>();
const int rank = param.x->dims().size();
auto* x = param.x;
auto* output = param.output;
output->mutable_data<T>();
const int rank = x->dims().size();
const int axis = CanonicalAxis(param.axis, rank);
int axis_dim = param.x->dims()[axis];
const int n = SizeToAxis(axis, param.x->dims());
const int d = SizeFromAxis(axis, param.x->dims());
int axis_dim = x->dims()[axis];
if (rank == 2 && axis == 1) {
lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
context, axis_dim, x, output);
} else {
const int n = SizeToAxis(axis, x->dims());
const int d = SizeFromAxis(axis, x->dims());
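// e.g. x->dims() == [2, 3, 4, 5] with axis == 2 gives n = 2 * 3 = 6,
// d = 4 * 5 = 20 and axis_dim = 4, so the functor sees a [6, 20] view and
// applies softmax over each group of axis_dim values.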
DDim shape(std::vector<DDim::value_type>{n, d});
DDim x_dims = x->dims();
DDim out_dims = output->dims();
Tensor input_2d;
Tensor out_2d;
input_2d.ShareDataWith(*param.x);
input_2d.Resize(shape);
out_2d.ShareDataWith(*param.output);
out_2d.Resize(shape);
DDim shape_2d(std::vector<DDim::value_type>{n, d});
x->Resize(shape_2d);
output->Resize(shape_2d);
lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
context, axis_dim, &input_2d, &out_2d);
lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
context, axis_dim, x, output);
x->Resize(x_dims);
output->Resize(out_dims);
}
}
virtual ~SoftmaxCompute() = default;
@@ -41,7 +41,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto axis = op_info->GetAttr<int>("axis");
int axis = op_info->HasAttr("axis") ? op_info->GetAttr<int>("axis") : -1;
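// axis == -1 (the fallback when the attribute is absent) selects the last
// dimension, matching the Paddle softmax default.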
// X node
std::shared_ptr<Node> x_node = nullptr;
@@ -20,6 +20,7 @@
#include "lite/core/scope.h"
#include "lite/core/tensor.h"
#include "lite/core/variable.h"
#include "lite/core/version.h"
#include "lite/model_parser/desc_apis.h"
#include "lite/model_parser/naive_buffer/combined_params_desc.h"
#include "lite/model_parser/naive_buffer/param_desc.h"
@@ -536,40 +537,56 @@ void SaveCombinedParamsNaive(const std::string &path,
}
pt_desc.Save();
table.SaveToFile(path);
table.AppendToFile(path);
}
void SaveModelNaive(const std::string &model_dir,
const Scope &exec_scope,
const cpp::ProgramDesc &cpp_prog,
bool combined) {
MkDirRecur(model_dir);
// Save program
const std::string prog_path = model_dir + "/__model__.nb";
const std::string prog_path = model_dir + ".nb";
naive_buffer::BinaryTable table;
naive_buffer::proto::ProgramDesc nb_proto_prog(&table);
naive_buffer::ProgramDesc nb_prog(&nb_proto_prog);
TransformProgramDescCppToAny(cpp_prog, &nb_prog);
nb_proto_prog.Save();
table.SaveToFile(prog_path);
// Save meta_version(uint16) into file
naive_buffer::BinaryTable meta_version_table;
meta_version_table.Require(sizeof(uint16_t));
uint16_t meta_version = 0;
memcpy(meta_version_table.cursor(), &meta_version, sizeof(uint16_t));
meta_version_table.Consume(sizeof(uint16_t));
meta_version_table.SaveToFile(prog_path);
// Save lite_version(char[16]) into file
const int paddle_version_length = 16 * sizeof(char);
naive_buffer::BinaryTable paddle_version_table;
paddle_version_table.Require(paddle_version_length);
std::string paddle_version = version();
memcpy(paddle_version_table.cursor(),
paddle_version.c_str(),
paddle_version_length);
paddle_version_table.Consume(paddle_version_length);
paddle_version_table.AppendToFile(prog_path);
VLOG(4) << "paddle_version:" << paddle_version;
// Save topology_size(uint64) into file
naive_buffer::BinaryTable topology_size_table;
topology_size_table.Require(sizeof(uint64_t));
uint64_t topology_size = table.size();
memcpy(topology_size_table.cursor(), &topology_size, sizeof(uint64_t));
topology_size_table.Consume(sizeof(uint64_t));
topology_size_table.AppendToFile(prog_path);
// save topology data into model file
table.AppendToFile(prog_path);
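// At this point the .nb file contains, in order:
// meta_version | opt_version | topo_size | topo_data;
// SaveCombinedParamsNaive below appends param_data, completing the layout
// documented before LoadModelNaiveFromFile.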
// Save Params
// NOTE: Only the main block is used now.
if (combined) {
const std::string combined_params_path = model_dir + "/param.nb";
SaveCombinedParamsNaive(combined_params_path, exec_scope, cpp_prog);
} else {
auto prog = cpp_prog;
auto &main_block_desc = *prog.GetBlock<cpp::BlockDesc>(0);
for (size_t i = 0; i < main_block_desc.VarsSize(); ++i) {
auto &var = *main_block_desc.GetVar<cpp::VarDesc>(i);
if (var.Name() == "feed" || var.Name() == "fetch" || !var.Persistable())
continue;
const std::string path = model_dir + "/" + var.Name() + ".nb";
SaveParamNaive(path, exec_scope, var.Name());
}
}
LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully";
SaveCombinedParamsNaive(prog_path, exec_scope, cpp_prog);
LOG(INFO) << "Save naive buffer model in '" << model_dir
<< ".nb' successfully";
}
#endif
@@ -638,14 +655,15 @@ void LoadParamNaive(const std::string &path,
}
void LoadCombinedParamsNaive(const std::string &path,
const uint64_t &offset,
lite::Scope *scope,
const cpp::ProgramDesc &cpp_prog,
bool params_from_memory) {
naive_buffer::BinaryTable table;
if (params_from_memory) {
table.LoadFromMemory(path.c_str(), path.length());
table.LoadFromMemory(path.c_str() + offset, path.length() - offset);
} else {
table.LoadFromFile(path);
table.LoadFromFile(path, offset, 0);
}
naive_buffer::proto::CombinedParamsDesc pt_desc(&table);
pt_desc.Load();
@@ -678,6 +696,13 @@ void LoadModelNaive(const std::string &model_dir,
CHECK(scope);
cpp_prog->ClearBlocks();
LOG(WARNING)
<< "WARNING: MobileConfig::set_model_dir and "
"MobileConfig::set_model_buffer are deprecated APIs "
"and will be removed in a later release. \n"
" MobileConfig::set_model_from_file(const std::string& model_file)"
" and MobileConfig::set_model_from_buffer(const std::string& "
"model_buffer) are recommended.";
// Load model
const std::string prog_path = model_dir + "/__model__.nb";
naive_buffer::BinaryTable table;
@@ -693,7 +718,7 @@ void LoadModelNaive(const std::string &model_dir,
// NOTE: Only the main block is used now.
if (combined) {
const std::string combined_params_path = model_dir + "/param.nb";
LoadCombinedParamsNaive(combined_params_path, scope, *cpp_prog, false);
LoadCombinedParamsNaive(combined_params_path, 0, scope, *cpp_prog, false);
} else {
auto &prog = *cpp_prog;
auto &main_block_desc = *prog.GetBlock<cpp::BlockDesc>(0);
@@ -718,6 +743,96 @@ void LoadModelNaive(const std::string &model_dir,
VLOG(4) << "Load naive buffer model in '" << model_dir << "' successfully";
}
/*
* Binary structure of naive_buffer model: model.nb
* ----------------------------------------------------------
* | | PART | Precision | Length(byte) |
* | 1 | meta_version | uint16_t | 2 |
* | 2 | opt_version | char[16] | 16 |
* | 3 | topo_size | uint64_t | 8 |
* | 4 | topo_data | char[] | topo_size byte |
* | 5 | param_data | char[] | |
* ----------------------------------------------------------
* Meaning of each part:
* meta_version: version of this model-file format, 0 by default.
* opt_version: version of the opt tool that transformed this model.
* topo_size: length of `topo_data`.
* topo_data: contains model's topology data.
* param_data: contains model's params data.
*/
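// With the fixed-width header above, topo_data always starts at byte offset
// 2 + 16 + 8 = 26, and param_data starts at offset 26 + topo_size.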
// usage: LoadModelNaiveFromFile is used for loading a model from a single file.
template <typename T>
void ReadModelDataFromFile(T *data,
const std::string &prog_path,
uint64_t *offset,
const uint64_t &size) {
naive_buffer::BinaryTable data_table;
data_table.LoadFromFile(prog_path, *offset, size);
memcpy(data, data_table.cursor(), size);
*offset = *offset + size;
}
void LoadModelNaiveFromFile(const std::string &filename,
Scope *scope,
cpp::ProgramDesc *cpp_prog) {
CHECK(cpp_prog);
CHECK(scope);
cpp_prog->ClearBlocks();
// ModelFile
const std::string prog_path = filename;
// Offset
uint64_t offset = 0;
// (1)get meta version
uint16_t meta_version;
ReadModelDataFromFile<uint16_t>(
&meta_version, prog_path, &offset, sizeof(uint16_t));
VLOG(4) << "Meta_version:" << meta_version;
// (2)get opt version
char opt_version[16];
const uint64_t opt_version_length = 16 * sizeof(char);
ReadModelDataFromFile<char>(
opt_version, prog_path, &offset, opt_version_length);
VLOG(4) << "Opt_version:" << opt_version;
// check version: the version of the opt tool should be consistent with the
// current Paddle-Lite version.
const std::string paddle_version = version();
const std::string opt_version_str = opt_version;
if (paddle_version != opt_version_str) {
LOG(WARNING) << "warning: the version of opt that transformed this model "
"is not consistent with current Paddle-Lite version."
"\n version of opt:"
<< opt_version
<< "\n version of current Paddle-Lite:" << paddle_version;
}
// (3)get topo_size
uint64_t topo_size;
ReadModelDataFromFile<uint64_t>(
&topo_size, prog_path, &offset, sizeof(uint64_t));
// (4)get topo data
naive_buffer::BinaryTable topo_table;
topo_table.LoadFromFile(prog_path, offset, topo_size);
offset = offset + topo_size;
// transform topo_data into cpp::ProgramDesc
naive_buffer::proto::ProgramDesc nb_proto_prog(&topo_table);
nb_proto_prog.Load();
naive_buffer::ProgramDesc nb_prog(&nb_proto_prog);
TransformProgramDescAnyToCpp(nb_prog, cpp_prog);
// (5)Load Params
LoadCombinedParamsNaive(prog_path, offset, scope, *cpp_prog, false);
VLOG(4) << "Load naive buffer model in '" << filename << "' successfully";
}
// warning: this is an old interface and is not recommended.
// todo: this interface will be abandoned in release/v3.0.0
void LoadModelNaiveFromMemory(const std::string &model_buffer,
const std::string &param_buffer,
Scope *scope,
@@ -741,7 +856,64 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
// Load Params
// NOTE: Only the main block is used now.
// only combined params are supported when loading a model from memory
LoadCombinedParamsNaive(param_buffer, scope, *cpp_prog, true);
LoadCombinedParamsNaive(param_buffer, 0, scope, *cpp_prog, true);
VLOG(4) << "Load model from naive buffer memory successfully";
}
// usage: LoadModelNaiveFromMemory is used for loading a naive-buffer model from memory.
template <typename T>
void ReadModelDataFromBuffer(T *data,
const std::string &model_buffer,
uint64_t *offset,
const uint64_t &size) {
naive_buffer::BinaryTable data_table;
data_table.LoadFromMemory(model_buffer.c_str() + *offset, size);
memcpy(data, data_table.cursor(), size);
*offset = *offset + size;
}
void LoadModelNaiveFromMemory(const std::string &model_buffer,
Scope *scope,
cpp::ProgramDesc *cpp_prog) {
CHECK(cpp_prog);
CHECK(scope);
cpp_prog->ClearBlocks();
// Offset
uint64_t offset = 0;
// (1)get meta version
uint16_t meta_version;
ReadModelDataFromBuffer<uint16_t>(
&meta_version, model_buffer, &offset, sizeof(uint16_t));
VLOG(4) << "Meta_version:" << meta_version;
// (2)get opt version
char opt_version[16];
const uint64_t paddle_version_length = 16 * sizeof(char);
ReadModelDataFromBuffer<char>(
opt_version, model_buffer, &offset, paddle_version_length);
VLOG(4) << "Opt_version:" << opt_version;
// (3)get topo_size and topo_data
uint64_t topo_size;
ReadModelDataFromBuffer<uint64_t>(
&topo_size, model_buffer, &offset, sizeof(uint64_t));
naive_buffer::BinaryTable table;
table.LoadFromMemory(model_buffer.c_str() + offset, topo_size);
offset = offset + topo_size;
naive_buffer::proto::ProgramDesc nb_proto_prog(&table);
nb_proto_prog.Load();
naive_buffer::ProgramDesc nb_prog(&nb_proto_prog);
// Transform to cpp::ProgramDesc
TransformProgramDescAnyToCpp(nb_prog, cpp_prog);
// Load Params
// NOTE: Only the main block is used now.
// only combined params are supported when loading a model from memory
LoadCombinedParamsNaive(model_buffer, offset, scope, *cpp_prog, true);
VLOG(4) << "Load model from naive buffer memory successfully";
}
@@ -94,15 +94,22 @@ void LoadParamNaive(const std::string& path,
lite::Scope* scope,
const std::string& name);
// warning: this old interface will be abandoned in release/v3.0.0;
// LoadModelNaiveFromFile is suggested instead.
void LoadModelNaive(const std::string& model_dir,
lite::Scope* scope,
cpp::ProgramDesc* prog,
bool combined = true);
void LoadModelNaiveFromFile(const std::string& filename,
lite::Scope* scope,
cpp::ProgramDesc* prog);
void LoadModelNaiveFromMemory(const std::string& model_buffer,
const std::string& param_buffer,
lite::Scope* scope,
cpp::ProgramDesc* cpp_prog);
void LoadModelNaiveFromMemory(const std::string& model_buffer,
lite::Scope* scope,
cpp::ProgramDesc* cpp_prog);
} // namespace lite
} // namespace paddle
@@ -121,17 +121,23 @@ TEST(ModelParser, SaveModelNaive) {
SaveModelNaive(save_pb_model_path, scope, prog);
}
TEST(ModelParser, LoadModelNaiveFromFile) {
CHECK(!FLAGS_model_dir.empty());
cpp::ProgramDesc prog;
Scope scope;
auto model_path = std::string(FLAGS_model_dir) + ".saved.naive.nb";
LoadModelNaiveFromFile(model_path, &scope, &prog);
}
TEST(ModelParser, LoadModelNaiveFromMemory) {
CHECK(!FLAGS_model_dir.empty());
cpp::ProgramDesc prog;
Scope scope;
auto model_path = std::string(FLAGS_model_dir) + ".saved.naive/__model__.nb";
auto params_path = std::string(FLAGS_model_dir) + ".saved.naive/param.nb";
auto model_path = std::string(FLAGS_model_dir) + ".saved.naive.nb";
std::string model_buffer = lite::ReadFile(model_path);
std::string params_buffer = lite::ReadFile(params_path);
LoadModelNaiveFromMemory(model_buffer, params_buffer, &scope, &prog);
LoadModelNaiveFromMemory(model_buffer, &scope, &prog);
}
} // namespace lite
@@ -44,24 +44,37 @@ void BinaryTable::SaveToFile(const std::string &filename) const {
fclose(fp);
}
void BinaryTable::LoadFromFile(const std::string &filename) {
// get file size
void BinaryTable::AppendToFile(const std::string &filename) const {
FILE *fp = fopen(filename.c_str(), "ab");
CHECK(fp) << "Unable to open file: " << filename;
if (fwrite(reinterpret_cast<const char *>(data()), 1, size(), fp) != size()) {
fclose(fp);
LOG(FATAL) << "Write file error: " << filename;
}
fclose(fp);
}
void BinaryTable::LoadFromFile(const std::string &filename,
const size_t &offset,
const size_t &size) {
// open file in readonly mode
FILE *fp = fopen(filename.c_str(), "rb");
CHECK(fp) << "Unable to open file: " << filename;
fseek(fp, 0L, SEEK_END);
size_t file_size = ftell(fp);
LOG(INFO) << "file size " << file_size;
// load data.
fseek(fp, 0L, SEEK_SET);
Require(file_size);
if (fread(reinterpret_cast<char *>(&bytes_[0]), 1, file_size, fp) !=
file_size) {
// determine the number of bytes to read; size == 0 means "from offset to EOF"
size_t buffer_size = size;
if (size == 0) {
fseek(fp, 0L, SEEK_END);
buffer_size = ftell(fp) - offset;
}
fseek(fp, offset, SEEK_SET);
Require(buffer_size);
// read `buffer_size` bytes into the internal buffer `bytes_`
if (fread(reinterpret_cast<char *>(&bytes_[0]), 1, buffer_size, fp) !=
buffer_size) {
fclose(fp);
LOG(FATAL) << "Read file error: " << filename;
}
fclose(fp);
// Set readonly.
is_mutable_mode_ = false;
}
@@ -61,8 +61,12 @@ struct BinaryTable {
/// Serialize the table to a binary buffer.
void SaveToFile(const std::string& filename) const;
void AppendToFile(const std::string& filename) const;
void LoadFromFile(const std::string& filename);
// void LoadFromFile(const std::string& filename);
void LoadFromFile(const std::string& filename,
const size_t& offset = 0,
const size_t& size = 0);
void LoadFromMemory(const char* buffer, size_t buffer_size);
};
......
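A hedged sketch of the extended `LoadFromFile` signature (the offset and size below are hypothetical values, chosen only to show the slice semantics):

```cpp
lite::BinaryTable whole, slice;

// Defaults read the entire file: offset = 0 and size = 0 ("until EOF").
whole.LoadFromFile("model.nb");

// Read param_size bytes starting at byte param_offset; both values are
// placeholders for wherever the parameter block actually lives.
const size_t param_offset = 16, param_size = 1024;
slice.LoadFromFile("model.nb", param_offset, param_size);
```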
......@@ -68,7 +68,7 @@ add_operator(axpy_op extra SRCS axpy_op.cc DEPS ${op_DEPS})
add_operator(gru_unit_op extra SRCS gru_unit_op.cc DEPS ${op_DEPS})
add_operator(gru_op extra SRCS gru_op.cc DEPS ${op_DEPS})
add_operator(layout_once_op extra SRCS layout_once_op.cc DEPS ${op_DEPS})
add_operator(density_prior_box_op extra SRCS density_prior_box_op.cc DEPS ${op_DEPS})
add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS})
add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS})
add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS})
add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS})
......@@ -79,7 +79,7 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP
add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS})
add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS})
add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS})
add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS})
add_operator(fake_quantize_range_abs_max_op basic SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS})
add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS})
add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS})
add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
......
......@@ -82,7 +82,20 @@ bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
param_.variance =
scope->FindVar(op_desc.Input("Variance").front())->GetMutable<Tensor>();
param_.y = scope->FindVar(op_desc.Output("Y").front())->GetMutable<Tensor>();
param_.is_test = op_desc.GetAttr<int>("is_test");
auto is_test_type = op_desc.GetAttrType("is_test");
switch (is_test_type) {
case OpDescAPI::AttrType::INT:
param_.is_test = op_desc.GetAttr<int>("is_test");
break;
case OpDescAPI::AttrType::BOOLEAN:
param_.is_test = op_desc.GetAttr<bool>("is_test");
break;
default:
LOG(FATAL) << "Unsupported attribute type: the type of attribute "
"`is_test` in BatchNormOP should be int or bool.";
}
if (op_desc.HasAttr("use_global_stats")) {
param_.use_global_stats = op_desc.GetAttr<bool>("use_global_stats");
}
......
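The test updates below exercise the new boolean path. As a quick sketch mirroring them (with `desc` as a `cpp::OpDesc`), both encodings of `is_test` now attach:

```cpp
cpp::OpDesc desc;
desc.SetType("batch_norm");
// Newer models store `is_test` as a boolean attribute ...
desc.SetAttr("is_test", true);
// ... while legacy models stored an int; AttachImpl now dispatches on
// GetAttrType("is_test") and accepts either.
// desc.SetAttr("is_test", static_cast<int>(1));
```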
......@@ -46,7 +46,7 @@ TEST(batch_norm_op_lite, test) {
desc.SetInput("Mean", {"mean"});
desc.SetInput("Variance", {"variance"});
desc.SetOutput("Y", {"y"});
desc.SetAttr("is_test", static_cast<int>(1));
desc.SetAttr("is_test", static_cast<bool>(true));
desc.SetAttr("use_global_stats", false);
desc.SetAttr("epsilon", 1e-5f);
desc.SetAttr("momentum", 0.9f);
......@@ -101,7 +101,7 @@ TEST(batch_norm_op_lite, test_enable_is_test) {
desc.SetOutput("VarianceOut", {"variance_out"});
desc.SetOutput("SavedMean", {"saved_mean"});
desc.SetOutput("SavedVariance", {"saved_variance"});
desc.SetAttr("is_test", static_cast<int>(0));
desc.SetAttr("is_test", static_cast<bool>(false));
desc.SetAttr("use_global_stats", false);
desc.SetAttr("epsilon", 1e-5f);
desc.SetAttr("momentum", 0.9f);
......
......@@ -52,12 +52,12 @@ inline int ConvOutputSize(int input_size,
return output_size;
}
void UpdatePaddingAndDilation(std::vector<int>* paddings,
std::vector<int>* dilations,
const std::vector<int>& strides,
const std::string padding_algorithm,
const lite::DDim data_dims,
const lite::DDim& ksize) {
inline void UpdatePaddingAndDilation(std::vector<int>* paddings,
std::vector<int>* dilations,
const std::vector<int>& strides,
const std::string padding_algorithm,
const lite::DDim data_dims,
const lite::DDim& ksize) {
// when padding_desc is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (size_t i = 0; i < strides.size(); ++i) {
......
......@@ -23,3 +23,5 @@ namespace operators {} // namespace operators
REGISTER_LITE_OP(fake_quantize_range_abs_max,
paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
REGISTER_LITE_OP(fake_quantize_abs_max,
paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
......@@ -40,13 +40,15 @@ class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front();
auto in_scale = op_desc.Input("InScale").front();
if (op_desc.HasInput("InScale")) {
auto in_scale = op_desc.Input("InScale").front();
param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
}
auto out = op_desc.Output("Out").front();
auto out_scale = op_desc.Output("OutScale").front();
param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
param_.out_scale = scope->FindVar(out_scale)->GetMutable<lite::Tensor>();
......
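With the `HasInput("InScale")` guard above, an op desc may now omit `InScale` entirely; a hedged sketch (tensor names are placeholders):

```cpp
cpp::OpDesc desc;
desc.SetType("fake_quantize_abs_max");
desc.SetInput("X", {"x"});
// "InScale" is intentionally not set; AttachImpl now checks
// op_desc.HasInput("InScale") before reading it.
desc.SetOutput("Out", {"out"});
desc.SetOutput("OutScale", {"out_scale"});
```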
......@@ -37,15 +37,6 @@ class FcOpLite : public OpLite {
bool InferShape() const override;
/*
bool Run() override {
CHECK(kernel_);
kernel_->Run();
return true;
}
*/
// TODO(Superjomn) replace framework::OpDesc with a lite one.
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
......
......@@ -32,21 +32,6 @@ bool MulOpLite::CheckShape() const {
CHECK_GT_OR_FALSE(x_dims.size(), static_cast<size_t>(param_.x_num_col_dims));
CHECK_GT_OR_FALSE(y_dims.size(), static_cast<size_t>(param_.y_num_col_dims));
// #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// auto x_mat_dims =
// framework::flatten_to_2d(x_dims.data(), param_.x_num_col_dims);
// auto y_mat_dims =
// framework::flatten_to_2d(y_dims.data(), param_.y_num_col_dims);
// PADDLE_ENFORCE_EQ(x_mat_dims[1],
// y_mat_dims[0],
// "First matrix's width must be equal with second
// matrix's"
// "height. %s, %s",
// x_mat_dims[1],
// y_mat_dims[0]);
// #endif
return true;
}
......@@ -73,49 +58,8 @@ bool MulOpLite::InferShape() const {
return true;
}
#ifdef LITE_WITH_TRAIN
bool MulGradOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.y);
CHECK_OR_FALSE(param_.output_grad);
return true;
}
bool MulGradOpLite::InferShape() const {
if (param_.x_grad) param_.x_grad->Resize(param_.x->dims());
if (param_.y_grad) param_.y_grad->Resize(param_.y->dims());
return true;
}
bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
auto X_name = op_desc.Input("X").front();
auto Y_name = op_desc.Input("Y").front();
auto Out_grad_name = op_desc.Input(framework::GradVarName("Out")).front();
if (op_desc.Output(framework::GradVarName("X")).size()) {
auto X_grad_name = op_desc.Output(framework::GradVarName("X")).front();
param_.x_grad = GetMutableVar<lite::Tensor>(scope, X_grad_name);
}
if (op_desc.Output(framework::GradVarName("Y")).size()) {
auto Y_grad_name = op_desc.Output(framework::GradVarName("Y")).front();
param_.y_grad = GetMutableVar<lite::Tensor>(scope, Y_grad_name);
}
param_.x = GetVar<lite::Tensor>(scope, X_name);
param_.y = GetVar<lite::Tensor>(scope, Y_name);
param_.output_grad = GetVar<lite::Tensor>(scope, Out_grad_name);
return true;
}
#endif
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(mul, paddle::lite::operators::MulOpLite);
#ifdef LITE_WITH_TRAIN
REGISTER_LITE_OP(mul_grad, paddle::lite::operators::MulGradOpLite);
#endif
......@@ -50,20 +50,18 @@ bool ReduceOp::InferShape() const {
} else {
size_t out_rank = keep_dim ? x_rank : x_rank - dims.size();
std::vector<DDim::value_type> out_dims(out_rank);
if (keep_dim) {
for (size_t i = 0; i < dims.size(); ++i) {
out_dims[dims[i]] = 1;
}
} else {
sort(dims.begin(), dims.end());
int dim_index = 0;
int out_index = 0;
for (size_t i = 0; i < x_rank; ++i) {
if (dims[dim_index] == static_cast<DDim::value_type>(i)) {
dim_index++;
} else {
out_dims[out_index++] = x_dims[i];
sort(dims.begin(), dims.end());
int dim_index = 0;
int out_index = 0;
for (size_t i = 0; i < x_rank; ++i) {
if (dim_index < dims.size() &&
dims[dim_index] == static_cast<DDim::value_type>(i)) {
if (keep_dim) {
out_dims[out_index++] = 1;
}
dim_index++;
} else {
out_dims[out_index++] = x_dims[i];
}
}
param_.output->Resize(out_dims);
......
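A standalone sketch (plain C++, not Paddle-Lite API) of the unified shape rule above: reducing dims `{1}` on `{3, 2, 3, 4}` yields `{3, 1, 3, 4}` with `keep_dim` and `{3, 3, 4}` without:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> ReducedShape(const std::vector<int64_t>& x_dims,
                                  std::vector<int> dims,
                                  bool keep_dim) {
  std::sort(dims.begin(), dims.end());
  std::vector<int64_t> out;
  size_t dim_index = 0;
  for (size_t i = 0; i < x_dims.size(); ++i) {
    if (dim_index < dims.size() &&
        dims[dim_index] == static_cast<int>(i)) {
      if (keep_dim) out.push_back(1);  // keep the reduced axis as size 1
      ++dim_index;
    } else {
      out.push_back(x_dims[i]);  // untouched axis
    }
  }
  return out;
}
```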
......@@ -44,6 +44,7 @@ if(LITE_BUILD_EXTRA)
lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
......@@ -192,7 +192,7 @@ class FcOPTest : public arena::TestCase {
fill_data_rand(bin.data(), -1.f, 1.f, bdims_.production());
SetCommonTensor(input_, dims_, din.data());
SetCommonTensor(weight_, wdims_, win.data());
SetCommonTensor(weight_, wdims_, win.data(), {}, true);
if (padding_weights_) {
std::vector<float> win_padding(wdims_padding_.production());
for (int64_t i = 0; i < wdims_[0]; ++i) {
......@@ -203,15 +203,15 @@ class FcOPTest : public arena::TestCase {
SetCommonTensor(weight_padding_, wdims_padding_, win_padding.data());
}
if (flag_bias) {
SetCommonTensor(bias_, bdims_, bin.data());
SetCommonTensor(bias_, bdims_, bin.data(), {}, true);
}
}
};
void TestFCMain(Place place,
float abs_error,
bool with_relu = false,
bool padding = false) {
void TestFC2D(Place place,
float abs_error,
bool with_relu = false,
bool padding = false) {
for (auto& m : {1, 3, 16}) {
for (auto& n : {1, 4, 16, 128, 256, 1024}) {
for (auto& k : {1, 16, 128, 1024}) {
......@@ -242,9 +242,35 @@ void TestFCMain(Place place,
}
}
void TestFCHelper(Place place,
float abs_error,
std::vector<int64_t> xdims,
std::vector<int64_t> wdims,
std::vector<int64_t> bdims,
int in_num_col_dims) {
std::unique_ptr<arena::TestCase> tester(new FcOPTest(place,
"def",
DDim(xdims),
DDim(wdims),
DDim(bdims),
in_num_col_dims,
false,
false));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
void TestFCnD(Place place, float abs_error) {
TestFCHelper(place, abs_error, {2, 3, 4}, {4, 5}, {5}, 2);
TestFCHelper(place, abs_error, {2, 3, 4}, {12, 5}, {5}, 1);
TestFCHelper(place, abs_error, {2, 3, 4, 5}, {5, 6}, {6}, 3);
TestFCHelper(place, abs_error, {2, 3, 4, 5}, {20, 6}, {6}, 2);
TestFCHelper(place, abs_error, {2, 3, 4, 5}, {60, 6}, {6}, 1);
}
TEST(FcOP, precision) {
Place place;
float abs_error = 6e-5;
float abs_error = 1e-4;
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 2e-1; // Using fp16 in NPU
......@@ -256,7 +282,9 @@ TEST(FcOP, precision) {
#else
return;
#endif
TestFCMain(place, abs_error);
TestFC2D(place, abs_error);
TestFCnD(place, abs_error);
}
#ifdef LITE_WITH_X86
......@@ -264,7 +292,7 @@ TEST(FcOP, padding_and_parallel) {
Place place(TARGET(kX86));
float abs_error = 1e-4;
x86::SetNumThreads(4);
TestFCMain(place, abs_error, true, true);
TestFC2D(place, abs_error, true, true);
}
#endif
......
......@@ -99,7 +99,7 @@ class MulComputeTester : public arena::TestCase {
std::vector<float> y(y_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
SetCommonTensor(y_, y_dims_, y.data());
SetCommonTensor(y_, y_dims_, y.data(), {}, true);
}
};
......@@ -123,7 +123,10 @@ TEST(Mul, precision) {
LOG(INFO) << "test mul op";
float abs_error = 2e-5;
Place place;
#if defined(LITE_WITH_XPU)
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_XPU)
place = TARGET(kXPU);
#else
return;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
void reduce_sum_n(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int chw_size = channel_in * hw_size;
int data_index, src_index;
for (int c = 0; c < channel_in; ++c) {
for (int h = 0; h < height_in; ++h) {
for (int w = 0; w < width_in; ++w) {
data_index = c * hw_size + h * width_in + w;
dst[data_index] = 0.0;
for (int n = 0; n < num_in; ++n) {
src_index = n * chw_size + data_index;
dst[data_index] += static_cast<float>(src[src_index]);
}
}
}
}
}
void reduce_sum_c(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int chw_size = hw_size * channel_in;
int data_index, src_index0, src_index;
for (int n = 0; n < num_in; ++n) {
for (int h = 0; h < height_in; ++h) {
for (int w = 0; w < width_in; ++w) {
data_index = n * hw_size + h * width_in + w;
src_index0 = n * chw_size + h * width_in + w;
dst[data_index] = 0.0;
for (int c = 0; c < channel_in; ++c) {
src_index = src_index0 + c * hw_size;
dst[data_index] += static_cast<float>(src[src_index]);
}
}
}
}
}
void reduce_sum_h(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int cw_size = channel_in * width_in;
int chw_size = cw_size * height_in;
int hw_size = height_in * width_in;
int data_index, src_index, src_index0;
for (int n = 0; n < num_in; ++n) {
for (int c = 0; c < channel_in; ++c) {
for (int w = 0; w < width_in; ++w) {
data_index = n * cw_size + c * width_in + w;
src_index0 = n * chw_size + c * hw_size + w;
dst[data_index] = 0.0;
for (int h = 0; h < height_in; ++h) {
src_index = src_index0 + h * width_in;
dst[data_index] += static_cast<float>(src[src_index]);
}
}
}
}
}
void reduce_sum_w(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int ch_size = channel_in * height_in;
int hw_size = height_in * width_in;
int chw_size = ch_size * width_in;
int data_index = 0;
int src_index0 = 0;
int src_index = 0;
for (int n = 0; n < num_in; ++n) {
for (int c = 0; c < channel_in; ++c) {
for (int h = 0; h < height_in; ++h) {
data_index = n * ch_size + c * height_in + h;
src_index0 = n * chw_size + c * hw_size + h * width_in;
dst[data_index] = 0.0;
for (int w = 0; w < width_in; ++w) {
src_index = src_index0 + w;
dst[data_index] += static_cast<float>(src[src_index]);
}
}
}
}
}
void reduce_sum_all(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
float sum = 0.0;
int src_index;
int n_id, c_id;
for (int n = 0; n < num_in; ++n) {
n_id = n * channel_in * height_in * width_in;
for (int c = 0; c < channel_in; ++c) {
c_id = c * height_in * width_in;
for (int h = 0; h < height_in; ++h) {
for (int w = 0; w < width_in; ++w) {
src_index = n_id + c_id + h * width_in + w;
sum = sum + src[src_index];
}
}
}
}
dst[0] = sum;
}
void reduce_sum_nc(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce n first.
DDimLite ddimA({1, channel_in, height_in, width_in});
lite::Tensor tensor_tmp;
tensor_tmp.Resize(ddimA);
float* tmp_out = tensor_tmp.mutable_data<float>();
reduce_sum_n(src, tmp_out, num_in, channel_in, height_in, width_in);
reduce_sum_c(tmp_out, dst, 1, channel_in, height_in, width_in);
}
void reduce_sum_ch(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce c first
DDimLite ddimA({num_in, 1, height_in, width_in});
lite::Tensor tensor_tmp;
tensor_tmp.Resize(ddimA);
float* tmp_out = tensor_tmp.mutable_data<float>();
reduce_sum_c(src, tmp_out, num_in, channel_in, height_in, width_in);
reduce_sum_h(tmp_out, dst, num_in, 1, height_in, width_in);
}
void reduce_sum_hw(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce h first
DDimLite ddimA({num_in, channel_in, 1, width_in});
lite::Tensor tensor_tmp;
tensor_tmp.Resize(ddimA);
float* tmp_out = tensor_tmp.mutable_data<float>();
reduce_sum_h(src, tmp_out, num_in, channel_in, height_in, width_in);
reduce_sum_w(tmp_out, dst, num_in, channel_in, 1, width_in);
}
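// Quick sanity check for the reference reducers above: for a
// {2, 1, 1, 2} input holding {1, 2, 3, 4} (n0 = {1, 2}, n1 = {3, 4}),
// reduce_sum_n writes {4, 6} into dst.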
class ReduceSumComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string input_ = "x";
std::string output_ = "out";
std::vector<int> dim_{0};
bool keep_dim_ = false;
bool reduce_all_ = false;
DDim x_dims_{{3, 2, 3, 4}};
public:
ReduceSumComputeTester(const Place& place,
const std::string& alias,
std::vector<int> dim,
bool keep_dim,
bool reduce_all,
DDim x_dims)
: TestCase(place, alias),
dim_(dim),
keep_dim_(keep_dim),
reduce_all_(reduce_all),
x_dims_(x_dims) {}
void RunBaseline(Scope* scope) override {
auto* x = scope->FindMutableTensor(input_);
const auto* x_data = x->data<float>();
auto* out = scope->NewTensor(output_);
auto x_rank = x_dims_.size();
if (!dim_.empty()) {
for (int i = 0; i < dim_.size(); i++) {
if (dim_[i] < 0) {
dim_[i] += x_rank;
}
}
}
sort(dim_.begin(), dim_.end());
std::vector<int64_t> out_dims;
if (reduce_all_) {
if (keep_dim_) {
out_dims.resize(x_rank);
for (int i = 0; i < x_rank; ++i) {
out_dims[i] = 1;
}
} else {
out_dims.push_back(1);
}
} else {
for (int i = 0; i < x_dims_.size(); i++) {
out_dims.push_back(x_dims_[i]);
}
if (keep_dim_) {
for (size_t i = 0; i < dim_.size(); ++i) {
out_dims[dim_[i]] = 1L;
}
} else {
int64_t kDelFlag = -2;
for (size_t i = 0; i < dim_.size(); ++i) {
out_dims[dim_[i]] = kDelFlag;
}
out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
out_dims.end());
}
}
out->Resize(DDim(out_dims));
auto* out_data = out->mutable_data<float>();
int in_n = x_dims_[0];
int in_c = x_dims_[1];
int in_h = x_dims_[2];
int in_w = x_dims_[3];
if (reduce_all_) {
reduce_sum_all(x_data, out_data, in_n, in_c, in_h, in_w);
} else if (dim_.size() == 1) {
switch (dim_[0]) {
case 0:
reduce_sum_n(x_data, out_data, in_n, in_c, in_h, in_w);
break;
case 1:
reduce_sum_c(x_data, out_data, in_n, in_c, in_h, in_w);
break;
case 2:
reduce_sum_h(x_data, out_data, in_n, in_c, in_h, in_w);
break;
case 3:
reduce_sum_w(x_data, out_data, in_n, in_c, in_h, in_w);
break;
default:
LOG(FATAL) << "error!!!";
}
} else if (dim_.size() == 2) {
if (dim_[0] == 0 && dim_[1] == 1) {
reduce_sum_nc(x_data, out_data, in_n, in_c, in_h, in_w);
} else if (dim_[0] == 1 && dim_[1] == 2) {
reduce_sum_ch(x_data, out_data, in_n, in_c, in_h, in_w);
} else if (dim_[0] == 2 && dim_[1] == 3) {
reduce_sum_hw(x_data, out_data, in_n, in_c, in_h, in_w);
} else {
LOG(FATAL) << "invalid dims_!!";
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("reduce_sum");
op_desc->SetInput("X", {input_});
op_desc->SetOutput("Out", {output_});
op_desc->SetAttr("dim", dim_);
op_desc->SetAttr("keep_dim", keep_dim_);
op_desc->SetAttr("reduce_all", reduce_all_);
}
void PrepareData() override {
std::vector<float> data(x_dims_.production());
for (int i = 0; i < x_dims_.production(); i++) {
data[i] = i * 1.0;
}
SetCommonTensor(input_, x_dims_, data.data());
}
};
void test_reduce_sum(Place place) {
std::vector<std::vector<int>> reduce_dim{
{0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}};
for (auto n : {1, 3}) {
for (auto c : {1, 2}) {
for (auto h : {1, 3}) {
for (auto w : {1, 3}) {
for (bool keep_dim : {false, true}) {
for (bool reduce_all : {false, true}) {
for (auto dim : reduce_dim) {
auto x_dims = DDim(std::vector<int64_t>({n, c, h, w}));
std::unique_ptr<arena::TestCase> tester(
new ReduceSumComputeTester(
place, "def", dim, keep_dim, reduce_all, x_dims));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
}
}
}
}
}
}
}
}
TEST(ReduceSum, precision) {
#ifdef LITE_WITH_X86
Place place(TARGET(kX86));
test_reduce_sum(place);
#endif
// #ifdef LITE_WITH_ARM
// Place place(TARGET(kARM));
// test_reduce_sum(place);
// #endif
}
} // namespace lite
} // namespace paddle
......@@ -21,19 +21,40 @@
namespace paddle {
namespace lite {
int data_index(std::vector<int> pos, DDimLite dims) {
int d1 = dims[1];
int d2 = dims[2];
int d3 = dims[3];
return pos[0] * d1 * d2 * d3 + pos[1] * d2 * d3 + pos[2] * d3 + pos[3];
std::vector<int> CalStrides(const DDim& dims) {
int dsize = dims.size();
std::vector<int> strides(dsize, 1);
for (int i = dsize - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * dims[i + 1];
}
return strides;
}
std::vector<int> CalIndex(const std::vector<int>& strides, int offset) {
int dsize = strides.size();
std::vector<int> index(dsize, 0);
for (int i = 0; i < dsize; i++) {
index[i] = offset / strides[i];
offset %= strides[i];
}
return index;
}
std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
std::vector<int> out_pos(in_pos.size());
std::vector<int> TransIndex(const std::vector<int>& in_index,
const std::vector<int>& axis) {
std::vector<int> out_index(in_index.size(), 0);
for (int i = 0; i < axis.size(); i++) {
out_pos[i] = in_pos[axis[i]];
out_index[i] = in_index[axis[i]];
}
return out_index;
}
int CalOffset(const std::vector<int>& strides, const std::vector<int>& index) {
int offset = 0;
for (int i = 0; i < index.size(); i++) {
offset += strides[i] * index[i];
}
return out_pos;
return offset;
}
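// Worked example for the helpers above: dims {2, 3, 4} gives
// CalStrides = {12, 4, 1}, and CalIndex({12, 4, 1}, 6) = {0, 1, 2}.
// With axis {0, 2, 1} the output dims are {2, 4, 3} (strides
// {12, 3, 1}); TransIndex({0, 1, 2}, {0, 2, 1}) = {0, 2, 1}, and
// CalOffset({12, 3, 1}, {0, 2, 1}) = 7, i.e. input element 6 lands
// at output offset 7.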
class TransposeComputeTester : public arena::TestCase {
......@@ -64,29 +85,19 @@ class TransposeComputeTester : public arena::TestCase {
out_shape[i] = dims_[axis_[i]];
}
out->Resize(out_shape);
auto out_dims = out->dims();
std::vector<int> x_strides = CalStrides(dims_);
std::vector<int> out_strides = CalStrides(out_dims);
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
auto y_dims = out->dims();
int input_n = dims_[0];
int input_c = dims_[1];
int input_h = dims_[2];
int input_w = dims_[3];
auto input_data = x->data<float>();
auto output_data = out->mutable_data<float>();
for (int n = 0; n < input_n; ++n) {
for (int c = 0; c < input_c; ++c) {
for (int h = 0; h < input_h; ++h) {
for (int w = 0; w < input_w; ++w) {
std::vector<int> in_pos{n, c, h, w};
std::vector<int> out_pos = pos_trans(in_pos, axis_);
int in_index = data_index(in_pos, dims_);
int out_index = data_index(out_pos, y_dims);
output_data[out_index] = input_data[in_index];
}
}
}
for (int i = 0; i < dims_.production(); i++) {
std::vector<int> x_index = CalIndex(x_strides, i);
std::vector<int> out_index = TransIndex(x_index, axis_);
int out_offset = CalOffset(out_strides, out_index);
out_data[out_offset] = x_data[i];
}
if (op_type_ == "transpose2") {
......@@ -114,6 +125,41 @@ class TransposeComputeTester : public arena::TestCase {
}
};
void TestTranspose2D(Place place, float abs_error) {
DDim x_dims{{4, 5}};
std::vector<std::vector<int>> axes{{0, 1}, {1, 0}};
for (auto axis : axes) {
std::unique_ptr<arena::TestCase> tester(
new TransposeComputeTester(place, "def", x_dims, axis));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
void TestTranspose3D(Place place, float abs_error) {
DDim x_dims{{3, 4, 5}};
std::vector<std::vector<int>> axes{
{0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {2, 1, 0}};
for (auto axis : axes) {
std::unique_ptr<arena::TestCase> tester(
new TransposeComputeTester(place, "def", x_dims, axis));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
void TestTranspose4D(Place place, float abs_error) {
DDim x_dims{{2, 3, 4, 5}};
std::vector<std::vector<int>> axes{
{0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}, {3, 1, 0, 2}};
for (auto axis : axes) {
std::unique_ptr<arena::TestCase> tester(
new TransposeComputeTester(place, "def", x_dims, axis));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
TEST(Transpose, precision) {
LOG(INFO) << "test Transpose op";
float abs_error = 2e-5;
......@@ -127,15 +173,9 @@ TEST(Transpose, precision) {
return;
#endif
DDim x_dims{{2, 3, 4, 5}};
std::vector<std::vector<int>> axes{
{0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}, {3, 1, 0, 2}};
for (auto axis : axes) {
std::unique_ptr<arena::TestCase> tester(
new TransposeComputeTester(place, "def", x_dims, axis));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
TestTranspose2D(place, abs_error);
TestTranspose3D(place, abs_error);
TestTranspose4D(place, abs_error);
}
} // namespace lite
......
......@@ -307,7 +307,7 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
#endif // LITE_WITH_ARM
// TODO(chenjiaoAngel): fix multi-threads, diff: 3x3 depthwise conv
#if 1 /// 3x3dw
#if 1 // 3x3dw
TEST(TestConv3x3DW, test_conv3x3_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) {
......@@ -449,7 +449,7 @@ TEST(TestConv3x3s1, test_conv_3x3s1) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
if (cin == 1 && cout ==1) {
if (cin == 1 && cout == 1) {
continue;
}
const float leakey_relu_scale = 8.88;
......
......@@ -457,7 +457,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
const std::vector<int>& power_mode) {}
#endif // LITE_WITH_ARM
#if 1 /// 3x3dw
#if 0 /// 3x3dw
TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) {
......@@ -494,7 +494,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
#if 1 /// 5x5dw
TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1}) {
for (auto& stride : {1, 2}) {
for (auto& pad : {0, 1, 2, 3, 4}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
......@@ -525,7 +525,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
}
#endif /// 5x5dw
#if 1 /// conv1x1s1
#if 0 /// conv1x1s1
TEST(TestConv1x1s1Int8, test_conv1x1s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32}) {
......@@ -562,7 +562,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
}
#endif /// conv1x1s1
#if 1 /// conv3x3s1
#if 0 /// conv3x3s1
TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 33}) {
......@@ -602,7 +602,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
}
#endif /// conv3x3s1
#if 1 /// conv3x3s2
#if 0 /// conv3x3s2
TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 31}) {
......@@ -642,7 +642,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
}
#endif /// conv3x3s2
#if 1 /// random param conv
#if 0 /// random param conv
TEST(TestConvRandInt8, test_conv_rand) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 17}) {
......
......@@ -285,7 +285,7 @@ TEST(TestLiteGemvInt8, gemv_prepacked_int8) {
paddle::lite::DeviceInfo::Init();
#endif
LOG(INFO) << "run basic sgemm test";
for (auto& m : {1, 3, 8, 32, 397}) {
for (auto& m : {1, 3, 8, 32}) { // ,397
for (auto& n : {1, 3, 13, 141, 512, 789}) {
for (auto& tra : {false}) {
for (auto& has_bias : {false, true}) {
......
......@@ -14,7 +14,7 @@ readonly NUM_PROC=${LITE_BUILD_THREADS:-4}
# global variables
BUILD_EXTRA=OFF
BUILD_JAVA=ON
BUILD_JAVA=OFF
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
OPTMODEL_DIR=""
......@@ -62,17 +62,17 @@ function prepare_thirdparty {
fi
}
function build_model_optimize_tool {
function build_opt {
cd $workspace
prepare_thirdparty
mkdir -p build.model_optimize_tool
cd build.model_optimize_tool
mkdir -p build.opt
cd build.opt
cmake .. -DWITH_LITE=ON \
-DLITE_ON_MODEL_OPTIMIZE_TOOL=ON \
-DWITH_TESTING=OFF \
-DLITE_BUILD_EXTRA=ON \
-DWITH_MKL=OFF
make model_optimize_tool -j$NUM_PROC
make opt -j$NUM_PROC
}
function make_tiny_publish_so {
......@@ -395,7 +395,7 @@ function main {
shift
;;
build_optimize_tool)
build_model_optimize_tool
build_opt
shift
;;
cuda)
......
......@@ -519,7 +519,7 @@ function test_model_optimize_tool_compile {
cd $workspace
cd build
cmake .. -DWITH_LITE=ON -DLITE_ON_MODEL_OPTIMIZE_TOOL=ON -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON
make model_optimize_tool -j$NUM_CORES_FOR_COMPILE
make opt -j$NUM_CORES_FOR_COMPILE
}
function _test_paddle_code_generator {
......