diff --git a/docs/advanced_user_guides/add_layout.md b/docs/advanced_user_guides/add_layout.md new file mode 100644 index 0000000000000000000000000000000000000000..11e504f93c2b1bcaefaa06c0a5f51aea0995884e --- /dev/null +++ b/docs/advanced_user_guides/add_layout.md @@ -0,0 +1,184 @@ +# 如何增加Layout + +Paddle-Lite中Place包含了Target、Layout、Precision信息,用来注册和选择模型中的具体Kernel。下面以增加Place中的layout:`ImageDefault`、`ImageFolder`、`ImageNW`为例,讲解如何增加新Layout。 + +根据在`lite/core/`、`lite/api`目录下以`NHWC`为关键词检索代码,发现需要分别在以下的文件中加入Layout内容: + +1. lite/api/paddle_place.h +2. lite/api/paddle_place.cc +3. lite/api/python/pybind/pybind.cc +4. lite/core/op_registry.h +5. lite/core/op_registry.cc + +## 1. lite/api/paddle_place.h + +在`enum class DataLayoutType`中加入对应的Layout,注意已有的Layout不能改变值,增加新Layout递增即可: + +```cpp +enum class DataLayoutType : int { + kUnk = 0, + kNCHW = 1, + kNHWC = 3, + kImageDefault = 4, // for opencl image2d + kImageFolder = 5, // for opencl image2d + kImageNW = 6, // for opencl image2d + kAny = 2, // any data layout + NUM = 7, // number of fields. +}; +``` + +## 2. lite/api/paddle_place.cc + +本文件有3处修改,注意在` DataLayoutToStr`函数中加入对应Layout的字符串名,顺序为`lite/api/paddle_place.h`中枚举值的顺序: + +```cpp +// 该文件第1处 +const std::string& DataLayoutToStr(DataLayoutType layout) { + static const std::string datalayout2string[] = { + "unk", "NCHW", "any", "NHWC", "ImageDefault", "ImageFolder", "ImageNW"}; + auto x = static_cast(layout); + CHECK_LT(x, static_cast(DATALAYOUT(NUM))); + return datalayout2string[x]; +} + +// 该文件第2处 +const std::string& DataLayoutRepr(DataLayoutType layout) { + static const std::string datalayout2string[] = {"kUnk", + "kNCHW", + "kAny", + "kNHWC", + "kImageDefault", + "kImageFolder", + "kImageNW"}; + auto x = static_cast(layout); + CHECK_LT(x, static_cast(DATALAYOUT(NUM))); + return datalayout2string[x]; +} + +// 该文件第3处 +std::set ExpandValidLayouts(DataLayoutType layout) { + static const std::set valid_set({DATALAYOUT(kNCHW), + DATALAYOUT(kAny), + DATALAYOUT(kNHWC), + DATALAYOUT(kImageDefault), + DATALAYOUT(kImageFolder), + DATALAYOUT(kImageNW)}); + if (layout == DATALAYOUT(kAny)) { + return valid_set; + } + return std::set({layout}); +} +``` + +## 3. lite/api/python/pybind/pybind.cc + +```cpp + // DataLayoutType + py::enum_(*m, "DataLayoutType") + .value("NCHW", DataLayoutType::kNCHW) + .value("NHWC", DataLayoutType::kNHWC) + .value("ImageDefault", DataLayoutType::kImageDefault) + .value("ImageFolder", DataLayoutType::kImageFolder) + .value("ImageNW", DataLayoutType::kImageNW) + .value("Any", DataLayoutType::kAny); +``` + +## 4. lite/core/op_registry.h + +找到KernelRegister final中的`using any_kernel_registor_t =`,加入下面修改信息: + +```cpp +// 找到KernelRegister final中的`using any_kernel_registor_t =` +// 加入如下内容: + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // +``` + + +## 5. 
lite/core/op_registry.cc + +该文件有2处修改: + +```cpp +// 该文件第1处 +#define CREATE_KERNEL1(target__, precision__) \ + switch (layout) { \ + case DATALAYOUT(kNCHW): \ + return Create(op_type); \ + case DATALAYOUT(kAny): \ + return Create(op_type); \ + case DATALAYOUT(kNHWC): \ + return Create(op_type); \ + case DATALAYOUT(kImageDefault): \ + return Create(op_type); \ + case DATALAYOUT(kImageFolder): \ + return Create(op_type); \ + case DATALAYOUT(kImageNW): \ + return Create(op_type); \ + default: \ + LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \ + } + +// 该文件第2处 +// 找到文件中的下面的函数 +KernelRegistry::KernelRegistry() + : registries_(static_cast(TARGET(NUM)) * + static_cast(PRECISION(NUM)) * + static_cast(DATALAYOUT(NUM))) + +// 在该函数中加入新增Layout的下面内容 + INIT_FOR(kOpenCL, kFP16, kNCHW); + INIT_FOR(kOpenCL, kFP16, kNHWC); + INIT_FOR(kOpenCL, kFP16, kImageDefault); + INIT_FOR(kOpenCL, kFP16, kImageFolder); + INIT_FOR(kOpenCL, kFP16, kImageNW); + INIT_FOR(kOpenCL, kFloat, kImageDefault); + INIT_FOR(kOpenCL, kFloat, kImageFolder); + INIT_FOR(kOpenCL, kFloat, kImageNW); + INIT_FOR(kOpenCL, kAny, kImageDefault); + INIT_FOR(kOpenCL, kAny, kImageFolder); + INIT_FOR(kOpenCL, kAny, kImageNW); +``` diff --git a/docs/advanced_user_guides/add_new_pass.md b/docs/advanced_user_guides/add_new_pass.md new file mode 100644 index 0000000000000000000000000000000000000000..93b27cd038642c702cd213adffcc378dc852a1b3 --- /dev/null +++ b/docs/advanced_user_guides/add_new_pass.md @@ -0,0 +1,437 @@ + +# 新增Pass方法 + +本文从三个方面介绍了`Lite`中的`Pass`结构:**Pass是什么**、**Pass的实现与接口**、**Pass的一般注册流程**。最后以`Fc_fuse_pass`为例介绍了`fusion_pass`的作用与注册方法。 + +## 前述:Pass是什么? + +**CxxPredictor加载模型后,在执行预测前会先优化模型。模型优化过程是通过Pass实现的。** +具体调用关系如下: +![图片](https://user-images.githubusercontent.com/45189361/69638690-20d21880-1096-11ea-8169-1d2c7e1a1609.png) + + - `CreatePredictor(CxxConfig)`函数调用了Predictor->Build(CxxConfig) + - CxxPredictor的构建过程(Build)分为两步: + - Predictor->LoadModel() 加载模型文件到program中 + - Predicotr->optimizer_.Run() 对Program中的原始图形结构进行优化 + - 对图结构的优化是通过调用 `Pass->Apply(const std::unique_ptr& graph)`方法实现的。 + + +**每一类Pass定义了一种优化过程**,包括:原模型中的kernel选取、OP融合、冗余OP去除、子图创建、内存优化、类型推导、类型转换等。 + + + + +## Pass的实现与接口 :Pass基类、PassManager和Pass注册 + +### 1、Pass基类:`paddle::lite::mir::Pass` +```c++ +class Pass { + public: + // Pass的类型,Pass按照作用的不同可以分为三种 + enum class Kind { //种类的作用不太清楚 + // 1. 修改模型中的图拓扑结构的Pass + kProgramWise = 0, + // 2. 不修改图结构,修改状态的Pass + kStmtWise, + // 3. 不修改 IR,用于搜集信息和可视化信息的Pass. + kDebug, + }; + + // 主要实现函数:Apply 函数定义了 Pass 运行时执行的操作 + virtual void Apply(const std::unique_ptr& graph) = 0; + + bool is_program_pass() const { return kind_ == Kind::kProgramWise; } + bool is_stmt_pass() const { return kind_ == Kind::kStmtWise; } + + virtual ~Pass() = default; + + private: + const Kind kind_; // pass 的种类 + std::string name_; // pass 的名称 + std::set bound_targets_; // 指定了Pass运行的硬件平台,模型优化过程会根据当前硬件平台是否匹配筛选Pass。 + std::unordered_map> bound_kernels_; // 绑定的kernel +}; + + +// Different kinds. 
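+// 补充注释:下面三个子类分别对应上面 Kind 枚举中的三种 Pass 类型;新增 Pass 时从相应子类继承即可(见后文注册流程)。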
+class ProgramPass : public Pass { + public: + ProgramPass() : Pass(Kind::kProgramWise) {} +}; +class StmtPass : public Pass { + public: + StmtPass() : Pass(Kind::kStmtWise) {} +}; + +class DebugPass : public Pass { + public: + DebugPass() : Pass(Kind::kDebug) {} +}; +``` +**代码位置**:`lite/core/mir/pass.h` +**主要类成员**: + `const Kind kind_` : Pass类型。pass 有三种基本基本类型 :修改图结构的`ProgramPass`、修改状态量的`StmtPass`和Debug过程采集信息与控制可视化的`DebugPass`。 + `std::string name_` :pass 的名称 + `std::set bound_targets_` : Pass运行的硬件平台,optimizer.Run()优化过程会根据硬件平台选择匹配的Pass。------根据硬件平台自动选择需要的pass + `std::unordered_map> bound_kernels_` : Pass 绑定的kernel (what's this used for) +**主要接口**: + `Pass::Apply(const std::unique_ptr& graph)` : Pass优化过程的具体操作,是新注册Pass需要实现的接口。输入为`SSAGraph`型指针,是对模型结构的拓扑表示。 + +### 2、Pass管理 `paddle::lite::mir::PassManager` + +```c++ +class PassManager { + public: + // 内部静态变量PassManager,用来存储使用的Pass和图优化操作 + static PassManager& Global() { + static PassManager x; + return x; + } + + // 执行所有的 Pass + void Run(const std::unique_ptr& graph) { + for (auto& pass : passes_) { + LOG(INFO) << "Running MIR pass " << pass->name(); + pass->Apply(graph); + } + + private: + std::list passes_; //存储所有的 Pass + std::map pass_map_; //使用map变量存储 PassName::Pass + + } + +``` +**代码位置**:`lite/core/mir/pass_manager.h` +**主要类成员**: +`std::list:unique_ptr> passes_;` : List类型,存储了所有已注册Pass。 +`std::map pass_map_; ` : Map类型,存储了所有"Pass名称-Pass类"键对,用于根据名称查找Pass。 + +**主要接口**: + `static PassManager& Global()` 返回PassManager全局静态变量,该变量存储了所有已注册的Pass +` bool AddNewPass(const std::string& name, Pass* pass)` 添加新的Pass到PassManager中 + + +### 3、 Pass 注册 `paddle::lite::mir::PassRegistry` +**代码位置**:`lite/core/mir/pass_registry.h` +**主要接口**: +`REGISTER_MIR_PASS(name__, class__)` :宏定义函数,用于注册Pass。注册Pass过程实现的是 `PassManager::Global().AddNewPass(name__, class__)`,将新注册Pass添加到全局变量`PassManager`中。 + + + +## Pass的一般注册流程与使用方法 + +### 1. Pass 注册流程 +在`lite/core/mir`或其子目录下继承`Pass基类`,实现`Pass::Apply`接口,并使用宏`REGISTER_MIR_PASS(name__, class__)`将Pass注册到`PassManager`即完成了新Pass注册。 + +**以新建 **`new_demo_pass`**为例**,具体流程如下: +(1)在`lite/core/mir`路径下新建`example_pass.cc` 和 `new_demo_pass.h` 文件 +(2)在`example_pass.h` 文件中继承Pass基类(ProgramPass、StmtPass或DebugPass)定义自己的Pass类。 +```c++ +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { +class ExamplePass : public ProgramPass { + void Apply(const std::unique_ptr &graph) override {} + ... +}; +} // namespace mir +} // namespace lite +} // namespace paddle +``` +(3)在`example_pass.cc` 文件中实现`ExamplePass::Apply()`接口,并注册`ExamplePass` +```c++ +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/example_pass.h" + +namespace paddle { +namespace lite { +namespace mir { +void ExamplePass::Apply(const std::unique_ptr& graph) { + ... +} +} // namespace mir +} // namespace lite +} // namespace paddle +REGISTER_MIR_PASS(example_pass, paddle::lite::mir::ExamplePass) + .BindTargets({TARGET(kARM)}); // Pass执行的目标硬件平台 + // .BindKernel("conv2d"); //Pass绑定的 kernel +``` + +(4)修改`lite/core/mir/CMakeLists.txt`文件,将`example_pass.cc` 编译到`mir_passes`库中 + +```cmake +lite_cc_library(mir_passes + SRCS + demo_pass.cc // 新建的Pass文件 + ... + memory_optimize_pass.cc + DEPS mir_pass types context ${mir_fusers} ${subgraph_passes}) +``` +### 2. Pass使用流程 + +将Pass注册到PassManager后不会自动生效。需要在`optimizer->run()` 函数中添加该Pass才会在模型优化过程中调用。 +(1)在`paddle_use_passes.h`文件中调用该Pass + +```cmake +#include "paddle_lite_factory_helper.h" // NOLINT + ... 
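+// 补充说明(非源码):USE_MIR_PASS 宏的大致作用是在链接期引用对应 Pass 的注册符号,
+// 以保证该 Pass 的注册代码不会被裁剪掉;具体定义以 paddle_lite_factory_helper.h 为准。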
+USE_MIR_PASS(new_demo_pass); //调用 new_demo_pass +``` +(2)要想在优化模型时调用该Pass,需要在`optimizer->run()`函数中手动添加调用。 + +修改`lite/core/optimizer.h`文件,添加`new_demo_pass`到`Optimizer::Run()`函数; +```c++ + class Optimizer { + public: + void Run(...) { + ... + if (passes.empty()) { + RunPasses(std::vector{ + {"new_demo_pass" //将新注册的Pass添加在这里 + ... + } + ... + } +``` +(3)只有CxxPredictor才会在模型加载后根据Pass优化模型。 +```c++ + ... +#include "paddle_use_passes.h" // 引用Pass优化模型 +void RunModel() { + // 1. 创建 CxxConfig + CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_valid_places(Place{TARGET(kARM), PRECISION(kFloat)}); + + // 2. 创建CxxPredictor,该过程包括加载模型和用Pass优化模型 + std::shared_ptr> predictor = + Creat(config); +} +``` + + + + +## Fusion Pass的定义与注册 + +`Fusion Pass`是一种常见图结构优化Pass,可将多个连续OP融合成单个等效OP,减少数据交换并简化图结构。Pass运行时调用`Fuser`自动查找并替换指定图结构,所以注册`FuserPass`时还需要实现对应的Fuser类。 + +下面以`fc_fuse_pass`为例,详细说明`FusionPass`的效果和注册方法。 + +### `fc_fuse_pass`的作用 +将相邻的`mul`算子和 `element_wise add `算子 融合成一个 `FC` 算子 +```c++ +mul(X) = X * W +elementwise_add( mul(x) ) = X * W + Bias +//----------> after fusion +FC(X) = X * W +Bias +``` + +Pass 运行效果如下: +![图片](https://user-images.githubusercontent.com/45189361/69639193-12383100-1097-11ea-9063-21f030414080.png) +mul和elementwise_add的原有参数映射到FC的参数上: +![图片](https://user-images.githubusercontent.com/45189361/69638836-74446680-1096-11ea-9cdc-a961fa995dfe.png) + +### `fc_fuse_pass`的注册方法 +#### 1、创建FcFuser +(1)在`lite/core/mir/fusion`路径下新建`fc_fuser.cc` 和 `fc_fuser.h` 文件 +(2)在`fc_fuser.h` 文件中继承`FuseBase`定义自己的Fuser类。 + +```c++ +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class FcFuser : public FuseBase { + public: + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle +``` +**主要接口**: +`FuseBase::BuildPattern` : 描述需要替换位置的图结构(pattern),Fuser运行时会自动查找并替换该pattern。 +`FuseBase::GenOpDesc` : 创建融合后的等效Fused_op。 +`FuseBase::InsertNewNode` :用Fused_op替换原始图结构(pattern)。 + +对于 `FcFuser`:BuildPattern描述的Pattern是`mul+elementwise add`,GenOpDesc创建的FC_op,InsertNewNode函数的效果是用新建的`FC_op`替换模型中的`mul+elementwise add` pattern。 + + +(3) 在`fc_fuser.cc`文件中实现 `BuildPattern()` 、`GenOpDesc()`、`InsertNewNode() `接口 + +下面以FcFuser为例介绍三种接口的实现: + +```c++ +// 1. BuildPattern函数,描述需要替换的图结构 +// FcFuser::BuildPattern() 描述了 mul + element_wise add 图结构 +void FcFuser::BuildPattern() { + // (1) 用OpNode描述和VarNode + // mul OP + auto* mul = OpNode("mul", "mul"); + // mul OP 的输入和输出 + auto* x = VarNode("x")->assert_is_op_input("mul", "X"); + auto* W = VarNode("W")->assert_is_op_input("mul", "Y"); + auto* mul_out = VarNode("mul_out"); + + // elementwise_add OP + auto* add = OpNode("add", "elementwise_add"); + //elementwise_add 的输入 + auto* b = VarNode("b")->assert_is_persistable_var(); + // elementwise_add OP的输出(最终输出) + auto* Out = VarNode("Out"); + + //(2) 描述拓扑连接 (Fuse之前mul 和elementwise_add的连接) + std::vector mul_inputs{W, x}; + std::vector add_inputs{mul_out, b}; + mul_inputs >> *mul >> *mul_out; + add_inputs >> *add >> *Out; + + + //(3) 声明新的拓扑结构中将会被移除的节点,包括被fuse的OP和OP之间的中间变量 + mul_out->AsIntermediate(); + mul->AsIntermediate(); + add->AsIntermediate(); +} + + +// 2. 
GenOpDesc函数新建等效 Fused_op +// FcFuser::GenOpDesc() 新建了Fc_op +cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { + // (1) 得到第一个OP节点的 OpDesc ,并清空输入输出信息 + cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + op_desc.mutable_inputs()->clear(); + op_desc.mutable_outputs()->clear(); + // (2) 修改OpDesc , 将OpType设置为 "fc" (FC OP 的OP_type), + op_desc.SetType("fc"); + // (3) 设置OpDesc中的Input、Output、Attrbute。分别连接到BuildPattern()函数中创建的VarNode + op_desc.SetInput("Input", {matched.at("x")->arg()->name}); + op_desc.SetInput("W", {matched.at("W")->arg()->name}); + op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); + op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); + op_desc.SetAttr( + "in_num_col_dims", + matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + return op_desc; +} + +// 3. InsertNewNode函数用Fused OP 替换模型图中的原始 Pattern +// FcFuser::InsertNewNode() 用Fc_OP替换原始模型图中的 " mul + element_wise add " +void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { + // (1) 创建FC OP的参数(OpDesc) + auto op_desc = GenOpDesc(matched); + // 创建一个 FC OP + auto fc_op = LiteOpRegistry::Global().Create("fc"); + + // 找到原拓扑结构中的scope (作用域)和 valid_places (可支持设备类型) + auto mul = matched.at("mul")->stmt()->op(); + auto* scope = mul->scope(); + auto& valid_places = mul->valid_places(); + + // (2) 将 FC OP的 scope和 valid_places设置与fuse前相同,并在图中创建该节点(node) + fc_op->Attach(op_desc, scope); + auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); + + // (3) 将FC节点连接到输入输出(var_node) + IR_NODE_LINK_TO(matched.at("W"), new_op_node); + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(matched.at("b"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("Out")); +} +``` + +#### 2、注册fc_fuse_pass + +(1)在`lite/core/mir/fusion`路径下新建`fc_fuse_pass.cc` 和 `fc_fuse_pass.h` 文件 +(2)在`fc_fuse_pass.h` 文件中,继承`ProgramPass`定义`FcFusePass`。 + +```c++ +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { +class FcFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; namespace mir namespace lite namespace paddle +``` +(3)在`fc_fuse_pass.cc` 文件中实现`FcFusePass::Apply()`接口,并注册`FcFusePass` +```c++ +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/example_pass.h" + +namespace paddle { +namespace lite { +namespace mir { +void FcFusePass::Apply(const std::unique_ptr& graph) { + fusion::FcFuser fuser; + fuser(graph.get());namespace mir +} // namespace lite +} // namespace paddle +REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) + .BindTargets({TARGET(kAny)}) // FcFusePass 可以在任何硬件平台执行 + .BindKernel("fc"); // FcFusePass 绑定 fc_kernel +``` + +(4)修改`lite/core/mir/fusion/CMakeLists.txt`文件,将`fc_fuser.cc` 编译到`mir_fusers`库 + +```cmake +lite_cc_library(fuse_fc + SRCS fc_fuser.cc + DEPS pattern_matcher_high_api) + +set(mir_fusers + fuse_fc + ... + CACHE INTERNAL "fusers") +``` + +(5)修改`lite/core/mir/CMakeLists.txt`文件,将`fc_fuse_pass.cc` 编译到`mir_pass`库 +```cmake +lite_cc_library(mir_passes + SRCS + fusion/fc_fuse_pass.cc + ... 
+ DEPS mir_pass types context ${mir_fusers} ${subgraph_passes}) +``` + +#### 3、使用 fc_fuse_pass + +(1) `lite/api/paddle_use_passes.h`使用`USE_LITE_PASS`宏来引入新加入的pass + +```c++ +USE_MIR_PASS(lite_fc_fuse_pass); +``` +(2) 在`lite/core/optimizer.h`文件的`Optimizer::Run()`函数中添加新注册的pass +```C++ +class Optimizer { + public: + void Run(Program&& program, + const std::vector& valid_places, + core::KernelPickFactor kernel_pick_factor, + const std::vector& passes = {}) { + ... + if (passes.empty()) { + RunPasses(std::vector{ + {"lite_fc_fuse_pass", // the newly registered pass + ... + "argument_type_display_pass"}}); + } else { + RunPasses(passes); + } + exec_scope_ = program.exec_scope(); + } +``` +(3) 以上修改完成后,在CreatePredictor(CxxConfig)创建CxxPredictor时,模型优化过程会调用`lite_fc_fuse_pass `,扫描`mul + element_wise add`结构并替换为等效的Fc_OP。 diff --git a/docs/advanced_user_guides/cv.md b/docs/advanced_user_guides/cv.md new file mode 100644 index 0000000000000000000000000000000000000000..1f53ac87564c80dcc15c5979a4212da5c3e730b8 --- /dev/null +++ b/docs/advanced_user_guides/cv.md @@ -0,0 +1,263 @@ +# CV 图像预处理API接口介绍 + +请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`, 其他编译参数设置请参考[源码编译](../source_compile), 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去,且会生成`paddle_image_preprocess.h`的API文件 + +- 硬件平台: `ARM` +- 操作系统:`MAC` 和 `LINUX` + +## CV 图像预处理功能 + +Lite 支持不同颜色空间的图像相互转换 `Convert` 、缩放 `Resize` 、翻转 `Flip`、旋转 `Rotate` 和图像数据转换为 `Tensor` 存储`ImageToTensor` 功能,下文将详细介绍每个功能的API接口。 + +### CV 枚举变量和结构体变量 + +- 颜色空间 +```cpp +enum ImageFormat { + RGBA = 0, + BGRA, + RGB, + BGR, + GRAY, + NV21 = 11, + NV12, +}; +``` +- 翻转参数 +```cpp +enum FlipParam { + X = 0, // flip along the X axis + Y, // flip along the Y axis + XY // flip along the XY axis +}; +``` +- 转换参数 +```cpp +typedef struct { + int ih; // input height + int iw; // input width + int oh; // outpu theight + int ow; // output width + FlipParam flip_param; // flip, support x, y, xy + float rotate_param; // rotate, support 90, 180, 270 +} TransParam; +``` + +### ImagePreprocess 类的成员变量 + +`ImagePreprocess` 类含有以下三个私有成员变量,通过构造函数进行初始化。 +```cpp +private: + ImageFormat srcFormat_; // input image color format + ImageFormat dstFormat_; // output image color format + TransParam transParam_; // image transform parameter + +// init +ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, TransParam param) { + this->srcFormat_ = srcFormat; + this->dstFormat_ = dstFormat; + this->transParam_ = param; +} +``` + +### 颜色空间转换 Convert + +`Convert` 函数支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA) + ++ 目前支持以下颜色空间的相互转换: + - GRAY2BGR + - GRAY2RGB + - BGR2RGB + - BGRA2BGR + - BGRA2RGB + - RGBA2RGB + - RGBA2BGR + - BGRA2RGBA + ++ 目前支持以下颜色空间的单向转换: + - NV12—BGR + - NV21—BGR + - NV12—RGB + - NV21—RGB + - NV12—BGRA + - NV21—BGRA + - NV12—RGBA + - NV21—RGBA + ++ `Convert` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageCovert(const uint8_t* src, + uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat); + ``` + + + 第一个 `imageCovert` 接口,缺省参数来源于 `ImagePreprocess` 类的成员变量。故在初始化 `ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`srcFormat_` + - param dstFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + + - 第二个`imageCovert` 接口,可以直接使用 + +### 缩放 Resize + +`Resize` 功能支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA) +`Resize` 功能目前支持的方法:`bilinear` + ++ `Resize` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst); + // 方法二 + void 
ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, int dstw, int dsth); + ``` + + + 第一个`imageResize` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.iw` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.ih` + - param dstw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param dsth:`ImagePreprocess` 类的成员变量`transParam_.ow` + + - 第二个`imageResize` 接口,可以直接使用 + +### 旋转 Rotate + +`Rotate` 功能支持颜色空间:GRAY、RGB(BGR)和RGBA(BGRA) +`Rotate` 功能目前支持的角度:90、180 和 270 + ++ `Rotate` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, float degree); + ``` + + + 第一个`imageRotate` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + - param degree:`ImagePreprocess` 类的成员变量`transParam_.rotate_param` + + - 第二个`imageRotate` 接口,可以直接使用 + +### 翻转 Flip + +`Flip` 功能支持颜色空间:GRAY、RGB(BGR)和RGBA(BGRA) +`Flip` 功能目前支持的功能:沿X轴翻转、沿Y轴翻转和沿XY轴翻转 + ++ `Flip` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, FlipParam flip_param); + ``` + + + 第一个`imageFlip` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + - param flip_param:`ImagePreprocess` 类的成员变量`transParam_.flip_param` + + - 第二个`imageFlip` 接口,可以直接使用 + +### Image2Tensor + +`Image2Tensor` 功能支持颜色空间:RGB(BGR)和RGBA(BGRA) +`Image2Tensor` 功能目前支持的Layout:`NCHW`和 `NHWC` +`Image2Tensor` 不仅完成图像转换为`Tensor`数据处理,而且还完成了图像数据的归一化处理 + ++ `Image2Tensor` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, LayoutType layout, float* means, float* scales); + // 方法二 + void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, ImageFormat srcFormat, srcw, int srch, LayoutType layout, float* means, float* scales; + ``` + + + 第一个`image2Tensor` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + + - 第二个`image2Tensor` 接口,可以直接使用 + + + +## CV 图像预处理 Demo 示例 + +例子:输入 `1920x1080` 大小的 `NV12` 图像src,输出 `960x540` 大小 `RGB` 格式的图像dst;然后,完成 `90` 度旋转和沿 `X` 轴翻转功能;最后,用 `NHWC` 格式存储在Tensor里。 + +定义 `ImagePreprocess` 类的对象,初始化成员变量 + +```cpp +// init +srcFormat = ImageFormat::NV12; +dstFormat = ImageFormat::RGB; +srch = 1920; +srcw = 1080; +dsth = 960; +dstw = 540; +flip_param = FlipParam::X; +degree = 90; +layout = LayoutType::NHWC +// 方法一: +TransParam tparam; +tparam.ih = srch; +tparam.iw = srcw; +tparam.oh = dsth; +tparam.ow = dstw; +tparam.flip_param = flip_param; +tparam.rotate_param = degree; +ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); +// 方法二: +ImagePreprocess image_preprocess(); 
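+// 补充注释(非源码):方法一通过构造函数保存了 srcFormat、dstFormat 和 TransParam,
+// 因此下文各 Demo 中“方法一”的两参数接口可直接使用这些缺省参数;
+// 方法二为默认构造,对象内未保存任何转换参数,只能调用下文各 Demo 中“方法二”的全参数接口。
+// 另外按 C++ 语法,默认构造应写作 `ImagePreprocess image_preprocess;`,带空括号会被解析为函数声明。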
+``` + +### imageConvert Demo + +```cpp +// 方法一: +image_preprocess.imageCovert(src, lite_dst); +// 方法二: +image_preprocess.imageCovert(src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); +``` + +### imageResize Demo + +```cpp +// 方法一: +image_preprocess.imageResize(lite_dst, resize_tmp); +// 方法二: +image_preprocess.imageResize(lite_dst,resize_tmp, (ImageFormat)dstFormat, srcw, +srch, dstw, dsth); +``` + +### imageRotate Demo + +```cpp +// 方法一: +image_preprocess.imageRotate(resize_tmp, tv_out_ratote); +// 方法二: +image_preprocess.imageRotate(resize_tmp,tv_out_ratote, (ImageFormat)dstFormat, dstw, dsth, degree); +``` + +### imageFlip Demo + +```cpp +// 方法一: +image_preprocess.imageFlip(tv_out_ratote, tv_out_flip); +// 方法二: +image_preprocess.imageFlip(tv_out_ratote, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip_param); +``` + +### image2Tensor Demo + +```cpp +// 方法一: +image_preprocess.image2Tensor(tv_out_flip, &dst_tensor, layout, means, scales); +// 方法二: +image_preprocess.image2Tensor(tv_out_flip, &dst_tensor,(ImageFormat)dstFormat, dstw, dsth, layout, means, scales); +``` diff --git a/docs/advanced_user_guides/npu.md b/docs/advanced_user_guides/npu.md new file mode 100644 index 0000000000000000000000000000000000000000..c84a3c3bd151dbc1574a0d874bacfbcd0af330a3 --- /dev/null +++ b/docs/advanced_user_guides/npu.md @@ -0,0 +1,129 @@ +# 使用华为NPU + +Paddle Lite是首款支持华为自研达芬奇架构NPU(Kirin 810/990 SoC搭载的NPU)的预测框架。 +原理是在线分析Paddle模型,将Paddle算子转成HiAI IR后,调用HiAI IR/Builder/Runtime APIs生成并执行HiAI模型。 + +## 已支持的设备 + +- 华为nova5、nova5i pro、mate30、mate30 pro、mate30 5G、荣耀v30,以及即将推出的mate40、p40。据华为透露,今后上市的大部分手机都会搭载其自研达芬奇架构NPU。 + +## 已支持的模型 + +- MobileNetV1 +- MobileNetV2 +- ResNet-18/50 +- ShuffleNetV2 +- CycleGAN (暂时需要华为内部rom的支持) +- 百度内部业务模型(由于涉密,不方便透露具体细节) + +## 已支持(或部分支持)的Paddle算子 + +- sigmoid +- relu +- tanh +- relu_clipped +- leaky_relu +- softsign +- hard_sigmoid +- batch_norm +- concat +- conv2d +- depthwise_conv2d +- conv2d_transpose +- dropout +- elementwise_add +- elementwise_sub +- elementwise_mul +- elementwise_div +- fusion_elementwise_add_activation +- fusion_elementwise_sub_activation +- fusion_elementwise_mul_activation +- fusion_elementwise_div_activation +- fc +- bilinear_interp +- nearest_interp +- matmul +- mul +- pad2d +- pool2d +- reduce_mean +- reshape +- reshape2 +- scale +- shuffle_channel +- softmax +- split +- sqrt +- square +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- instance_norm (暂时需要华为内部rom的支持) +- layer_norm (暂时需要华为内部rom的支持) + +## 编译支持NPU的Paddle Lite库 + +- 从https://developer.huawei.com/consumer/cn/hiai/下载华为HiAI DDK后解压到任意路径(注意:华为提供了多个版本的DDK,我们需要下载针对麒麟810/990芯片HiAI Foundation开发套件,例如最新的[DDK V310版本](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip))。 +- 将HiAI DDK中的ai_ddk_lib目录拷贝至Paddle Lite源码根目录后,使用[NPU编译脚本](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_npu.sh)编译full_publish和tiny_publish。 + +注意:以下是HiAI DDK V310版解压后的目录结构,需要将ai_ddk_lib目录拷贝至Paddle Lite源码根目录。 +```shell +- app_sample +- ddk + - ai_ddk_lib + - include + - lib # for armv7 + - lib64 # for armv8 +- document +- tools +``` + +- full_publish and tiny_publish for armv8,由于HiAI DDK的armv7和armv8的so库均基于c++_shared构建,因此,建议使用c++_shared编译Paddle Lite。 +```shell +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared full_publish +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared tiny_publish +``` + +- full_publish and tiny_publish for armv7 +```shell +$ 
./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared full_publish +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish +``` + +注意:为了保证编译环境一致,建议参考[源码编译](../installation/source_compile)中的Docker开发环境进行配置,然后再执行上述命令。 + +## 优化生成NPU模型 + +- model_optimize_tool工具已经支持生成NPU模型,仅需要将valid_targets设置为npu,arm即可,具体参考[模型转化方法](../user_guides/model_optimize_tool)。 +```shell +./model_optimize_tool --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=npu,arm \ + --prefer_int8_kernel=(true|false) \ + --record_tailoring_info =(true|false) +``` +- model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子,并没有真正生成NPU HiAI模型,只有在执行时才会将标记的Paddle算子转成HiAI IR,最终生成并执行HiAI模型,具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。 +- 不同模型,不同型号(ROM版本)的华为手机,在执行阶段,由于某些Paddle算子无法完全转成HiAI IR,或目标手机的HiAI版本过低等原因,可能导致HiAI模型无法成功生成,在这种情况下,Paddle Lite会调用CPU版算子进行运算完成整个预测任务。 + +## 通过JAVA接口加载并执行NPU模型 + +- 使用方法和[Java实例](../user_guides/java_demo)一致,无需额外设置任何参数,只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。 + +注意:在拷贝libpaddle_lite_jni.so的时候,由于依赖HiAI DDK so和libc++_shared.so库,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,拷到libpaddle_lite_jni.so同级目录下。 + +## 通过C++接口加载并执行NPU模型 + +- 使用方法和[C++实例](../user_guides/cpp_demo)一致,同样无需额外设置任何参数,只需将模型换成NPU模型即可。 + +注意:1)不能使用安卓模拟器,需要使用真实设备,且必须是支持NPU的华为手机。2)在使用adb push命令向手机推送目标程序时,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,推送到目标程序同级目录下。 + + +## 其它说明 + +- 华为达芬奇架构的NPU内部大量采用float16进行运算,因此,预测结果会存在偏差,但大部分情况下精度不会有较大损失,可参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中Image Classification Demo for Android对同一张图片CPU与NPU的预测结果。 +- 华为Kirin 810/990 Soc搭载的自研达芬奇架构的NPU,与Kirin 970/980 Soc搭载的寒武纪NPU不一样,同样的,与Hi3559A、Hi3519A使用的NNIE也不一样,Paddle Lite只支持华为自研达芬奇架构NPU。 +- 我们正在持续增加能够适配HiAI IR的Paddle算子bridge/converter,以便适配更多Paddle模型,同时华为研发同学也在持续对HiAI IR性能进行优化。 diff --git a/docs/advanced_user_guides/support_operation_list.md b/docs/advanced_user_guides/support_operation_list.md index 7c2ceb0ff819f7f1676308a33ec88f5eab820e57..c0acb02b9d7fb71f8abf79a651e07f2d78c1d2c1 100644 --- a/docs/advanced_user_guides/support_operation_list.md +++ b/docs/advanced_user_guides/support_operation_list.md @@ -39,6 +39,7 @@ - expand - fake_channel_wise_dequantize_max_abs - fake_dequantize_max_abs +- fake_quantize_abs_max - fake_quantize_dequantize_moving_average_abs_max - fake_quantize_moving_average_abs_max - fake_quantize_range_abs_max @@ -56,10 +57,11 @@ - fusion_elementwise_mul_activation - fusion_elementwise_sub_activation - gather +- gelu - generate_proposals -- graph_op - greater_equal - greater_than +- grid_sampler - gru - gru_unit - hard_sigmoid @@ -93,7 +95,7 @@ - nearest_interp - negative - norm -- notequal +- not_equal - pad2d - pool2d - power @@ -127,6 +129,7 @@ - sequence_expand - sequence_expand_as - sequence_pool +- sequence_pool_concat - sequence_reshape - sequence_reverse - sequence_softmax @@ -144,6 +147,7 @@ - squeeze - squeeze2 - stack +- subgraph - swish - tanh - top_k @@ -216,6 +220,7 @@ - generate_proposals - greater_equal - greater_than +- grid_sampler - gru - gru_unit - hard_sigmoid @@ -301,6 +306,9 @@ - gelu - gru - layer_norm +- leaky_relu +- lookup_table +- lookup_table_v2 - match_matrix_tensor - matmul - mul @@ -378,9 +386,11 @@ - yolo_box ### OpenCL kernels +- concat - conv2d 
- depthwise_conv2d - elementwise_add +- elementwise_mul - fc - fusion_elementwise_add_activation - layout @@ -388,5 +398,10 @@ - io_copy - io_copy_once - mul +- nearest_interp - pool2d - relu +- reshape +- reshape2 +- scale +- sigmoid diff --git a/docs/advanced_user_guides/x86.md b/docs/advanced_user_guides/x86.md new file mode 100644 index 0000000000000000000000000000000000000000..7cb08683440312b0349662699b05e99df0cb6df1 --- /dev/null +++ b/docs/advanced_user_guides/x86.md @@ -0,0 +1,104 @@ +# 使用X86预测库 + +Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../installation/source_compile)。 + +(注意:非docker Linux环境需要是Ubuntu16.04) + +## 编译 + +1、 下载代码 +```bash +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +#需要切换到 release/v2.0.0之后版本 +git checkout +``` + +2、 源码编译 + +```bash +cd Paddle-Lite +./lite/tools/build.sh x86 +``` + +## 编译结果说明 + +x86编译结果位于 `build.lite.x86/inference_lite_lib` +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件 `test_model_bin` + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库 + +3、 `third_party` 文件夹:第三方库文件 + +## x86预测API使用示例 + +```c++ +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#include "paddle_use_passes.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +DEFINE_string(model_dir, "", "Model dir path."); +DEFINE_string(optimized_model_dir, "", "Optimized model dir."); +DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} +void RunModel() { + // 1. Set CxxConfig + CxxConfig config; + config.set_model_file(FLAGS_model_dir + "model"); + config.set_param_file(FLAGS_model_dir + "params"); + + config.set_valid_places({ + lite_api::Place{TARGET(kX86), PRECISION(kFloat)} + }); + + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize(shape_t({1, 3, 224, 224})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl; + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]:" << output_tensor->data()[i] << std::endl; + } +} + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + RunModel(); + return 0; +} +``` diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md new file mode 100644 index 0000000000000000000000000000000000000000..38385a4267d5727d9c5c7d985d3457dd011e203c --- /dev/null +++ b/docs/api_reference/cxx_api_doc.md @@ -0,0 +1,874 @@ + +# C++ API文档 + +## CreatePaddlePredictor + +```c++ +template +std::shared_ptr CreatePaddlePredictor(const ConfigT&); +``` + +`CreatePaddlePredictor`用来根据`MobileConfig`构建预测器。 + +示例: + +```c++ +// 设置MobileConfig +MobileConfig config; +config.set_model_dir(FLAGS_model_dir); + +// 根据MobileConfig创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); +``` + +参数: + +- `config(MobileConfig)` - 用于构建Predictor的配置信息。 + +返回:`PaddlePredictor`指针 + +返回类型:`std::shared_ptr` + +## CxxConfig + +```c++ +class CxxConfig; +``` + +`CxxConfig`用来配置构建CxxPredictor的配置信息,如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。 + +示例: + +```c++ +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置工作线程数 +config.set_threads(4); +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) +# 设置valid places +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = create_paddle_predictor(config) +``` + +### `set_model_dir(model_dir)` + +设置模型文件夹路径,当需要从磁盘加载非combined模型时使用。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### `set_model_file(model_file)` + +设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `model_file()` + +获取设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型文件路径 + +返回类型:`str` + + + +### `set_param_file(param_file)` + +设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `param_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `param_file()` + +获取设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型参数文件路径 + +返回类型:`str` + + + +### `set_valid_places(valid_places)` + +设置可用的places列表。 + +参数: + +- `valid_places(list)` - 可用place列表。 + +返回类型:`None` + +示例: + +```c++ +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置valid places +# 注意,valid_places列表中Place的排序表明了用户对Place的偏好程度,如用户想优先使用ARM上Int8精度的 +# kernel,则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。 +places = [Place(TargetType.ARM, PrecisionType.INT8), + Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = create_paddle_predictor(config) +``` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数。 
+ +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + + +### `set_x86_math_library_num_threads(threads)` + +设置CPU Math库线程数,CPU核心数支持情况下可加速预测。默认为1,并且仅在x86下有效。 + +参数: + +- `threads(int)` - CPU Math库线程数。 + +返回:`None` + +返回类型:`None` + + +### `x86_math_library_num_threads()` + +返回CPU Math库线程数,CPU核心数支持情况下可加速预测。仅在x86下有效。 + +参数: + +- `None` + +返回:CPU Math库线程数。 + +返回类型:`int` + +## MobileConfig + +```c++ +class MobileConfig; +``` + +`MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、模型的内存地址(从内存加载模型时使用)、能耗模式、工作线程数等等。 + +*注意:输入的模型需要使用[Model Optimize Tool](../model_optimize_tool)转化为NaiveBuffer格式的优化模型。* + +示例: + +```c++ +MobileConfig config; +// 设置NaiveBuffer格式模型目录,从文件加载模型时使用 +config.set_model_dir(FLAGS_model_dir); +// 设置工作线程数 +config.set_threads(4); +// 设置能耗模式 +config.set_power_mode(LITE_POWER_HIGH); + +// 根据MobileConfig创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); +``` + +### `set_model_from_file(model_dir)` + +设置模型文件,当需要从磁盘加载模型时使用。 + +参数: + +- `model_dir(std::string)` - 模型文件路径 + +返回:`None` + +返回类型:`void` + +### `set_model_dir(model_dir)` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`set_model_from_file`接口。 + +设置模型文件夹路径,当需要从磁盘加载模型时使用。 + +参数: + +- `model_dir(std::string)` - 模型文件夹路径 + +返回:`None` + +返回类型:`void` + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`std::string` + +### `set_model_from_buffer(model_buffer)` + +设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(std::string)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + +### `set_model_buffer(model_buffer, model_buffer_size, param_buffer, param_buffer_size)` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`set_model_from_buffer`接口。 + +设置模型、参数的内存地址,当需要从内存加载模型时使用。 + +示例: + +```c++ +// 读取模型文件到内存 +std::string model_buffer = ReadFile(FLAGS_model_path); +std::string params_buffer = lite::ReadFile(FLAGS_params_path); + +// 设置MobileConfig +lite_api::MobileConfig config; +config.set_model_buffer(model_buffer.c_str(), model_buffer.size(), + params_buffer.c_str(), params_buffer.size()); + +// 根据MobileConfig创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); +``` + +参数: + +- `model_buffer(const char*)` - 内存中模型结构数据。 +- `model_buffer_size(size_t)` - 内存中模型结构数据的大小。 +- `param_buffer(const char*)` - 内存中模型参数数据。 +- `param_buffer_size(size_t)` - 内存中模型参数数据的大小。 + +返回:`None` + +返回类型:`Void` + + + +### `model_from_memory()` + +是否从内存中加载模型,当使用`set_model_buffer`接口时返回`true` + +参数: + +- `None` + +返回:是否从内存加载模型 + +返回类型:`bool` + + + +### `model_buffer()` + +获取内存中模型结构数据。 + +参数: + +- `None` + +返回:内存中模型结构数据 + +返回类型:`const std::string&` + + + +### `param_buffer()` + +获取内存中模型参数数据。 + +参数: + +- `None` + +返回:内存中模型结构数据 + +返回类型:`const std::string&` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`void` + + + +### `power_mode()` + +获取设置的CPU能耗模式。 + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`void` + + + +### `threads()` + +获取设置的工作线程数。 + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## PaddlePredictor + +```c++ +class PaddlePredictor +``` + +`PaddlePredictor`是Paddle-Lite的预测器,由`CreatePaddlePredictor`根据`MobileConfig`进行创建。用户可以根据PaddlePredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```c++ +int64_t 
ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +// 设置MobileConfig +MobileConfig config; +config.set_model_dir(FLAGS_model_dir); + +// 根据MobileConfig创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); + +// 获得模型的输入和输出名称 +std::vector input_names = predictor->GetInputNames(); +for (int i = 0; i < input_names.size(); i ++) { + printf("Input name[%d]: %s\n", i, input_names[i].c_str()); +} +std::vector output_names = predictor->GetOutputNames(); +for (int i = 0; i < output_names.size(); i ++) { + printf("Output name[%d]: %s\n", i, output_names[i].c_str()); +} + +// 准备输入数据 +// (1)根据index获取输入Tensor +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +// (2)根据名称获取输入Tensor +// std::unique_ptr input_tensor(std::move(predictor->GetInputByName(input_names[0]))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} + +// 执行预测 +predictor->Run(); + +// 获取输出 +// (1)根据index获取输出Tensor +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +// (2)根据名称获取输出Tensor +// std::unique_ptr output_tensor(std::move(predictor->GetOutput(output_names[0]))); +printf("Output dim: %d\n", output_tensor->shape()[1]); +for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + printf("Output[%d]: %f\n", i, output_tensor->data()[i]); +} +``` + +### `GetInput(index)` + +获取输入Tensor指针,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor`的指针 + +返回类型:`std::unique_ptr` + + + +### `GetOutput(index)` + +获取输出Tensor的指针,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出Tensor`的指针 + +返回类型:`std::unique_ptr` + +### `GetInputNames()` + +获取所有输入Tensor的名称。 + +参数: + +- `None` + +返回:所有输入Tensor的名称 + +返回类型:`std::vector` + +### `GetOutputNames()` + +获取所有输出Tensor的名称。 + +参数: + +- `None` + +返回:所有输出Tensor的名称 + +返回类型:`std::vector` + +### `GetInputByName(name)` + +根据名称获取输出Tensor的指针,用来获取模型的输出结果。 + +参数: + +- `name(const std::string)` - 输入Tensor的名称 + +返回:输入Tensor`的指针 + +返回类型:`std::unique_ptr` + +### `GetTensor(name)` + +根据名称获取输出Tensor的指针。 + +**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 + +参数: + +- `name(const std::string)` - Tensor的名称 + +返回:指向`const Tensor`的指针 + +返回类型:`std::unique_ptr` + +### `Run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`void` + + + +### `GetVersion()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`std::string` + +## TargetType + +```c++ +class TargetType; +``` +`TargetType`为目标设备硬件类型,用户可以根据应用场景选择硬件平台类型。 + +枚举型变量`TargetType`的所有可能取值包括: + +`{X86, CUDA, ARM, OpenCL, FPGA, NPU}` + + +## PrecisionType +```c++ +class PrecisionType {FP32}; +``` +`PrecisionType`为模型中Tensor的数据精度,默认值为FP32(float32)。 + +枚举型变量`PrecisionType`的所有可能取值包括: + +`{FP32, INT8, INT32, INT64}` + + + + +## DataLayoutType + +```c++ +class DataLayoutType {NCHW}; +``` +`DataLayoutType`为Tensor的数据格式,默认值为NCHW(number, channel, height, weigth)。 + +枚举型变量`DataLayoutType`的所有可能取值包括: + +` {NCHW, NHWC}` + + + +## Place +```c++ +class Place{ + TargetType target; + PrecisionType precision{FP32}; + DataLayoutType layout{NCHW} +} +``` +`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合,说明运行时的设备类型、数据精度和数据格式。 + +示例: +```C++ +Place{TargetType(ARM), 
PrecisionType(FP32), DataLayoutType(NCHW)} +``` + +## PowerMode + +```c++ +enum PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```c++ +MobileConfig config; +// 设置NaiveBuffer格式模型目录 +config.set_model_dir(FLAGS_model_dir); +// 设置能耗模式 +config.set_power_mode(LITE_POWER_HIGH); + +// 根据MobileConfig创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | ------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | + + + +## Tensor + +```c++ +class Tensor +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置Shape、数据、LoD信息等。 + +*注意:用户应使用`PaddlePredictor`的`GetInput`和`GetOuput`接口获取输入/输出的`Tensor`。* + +示例: + +```c++ +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +// 设置MobileConfig +MobileConfig config; +config.set_model_dir(FLAGS_model_dir); + +// 根据MobileConfig创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); + +// 准备输入数据, 获取输入Tensor +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +// 设置输入Tensor维度信息 +input_tensor->Resize({1, 3, 224, 224}); +// 设置输入数据 +auto* data = input_tensor->mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} + +// 执行预测 +predictor->Run(); + +// 获取输出Tensor +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +// 获取输出Tensor维度 +printf("Output dim: %d\n", output_tensor->shape()[1]); +// 获取输出Tensor数据 +for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + printf("Output[%d]: %f\n", i, output_tensor->data()[i]); +} +``` + +### `Resize(shape)` + +设置Tensor的维度信息。 + +参数: + +- `shape(std::vector)` - 维度信息 + +返回:`None` + +返回类型:`void` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`std::vector` + + + +### `data()` + +```c++ +template +const T* data() const; +``` + +获取Tensor的底层数据的常量指针,根据传入的不同模型类型获取相应数据。用于读取Tensor数据。 + +示例: + +```c++ +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +// 如果模型中输出为float类型 +output_tensor->data() +``` + +参数: + +- `None` + +返回:`Tensor`底层数据常量指针 + +返回类型:`const T*` + + + +### `mutable_data()` + +```c++ +template +T* mutable_data() const; +``` + +获取Tensor的底层数据的指针,根据传入的不同模型类型获取相应数据。用于设置Tensor数据。 + +示例: + +```c++ +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +// 如果模型中输出为float类型 +auto* data = input_tensor->mutable_data(); +// 设置Tensor数据 +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} +``` + +参数: + +- `None` + +返回:`Tensor`底层数据指针 + +返回类型:`T*` + + + +### `SetLoD(lod)` + +设置Tensor的LoD信息。 + +参数: + +- `lod(std::vector>)` - Tensor的LoD信息 + +返回:`None` + +返回类型:`void` + + + +### `lod()` + +获取Tensor的LoD信息 + +参数: + +- `None` + +返回:`Tensor`的LoD信息 + +返回类型:`std::vector>` diff --git a/docs/index.rst b/docs/index.rst index 
c44c4e171d128c1469343cd01a91cc1d12762b8a..9f9a2be8c9a34901cabc9f69d21de4fa57cc9057 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,6 +13,7 @@ Welcome to Paddle-Lite's documentation! introduction/tech_highlights introduction/architecture + introduction/support_hardware .. toctree:: :maxdepth: 1 @@ -34,7 +35,13 @@ Welcome to Paddle-Lite's documentation! :caption: 使用指南 :name: sec-user-guides + user_guides/model_optimize_tool + user_guides/library_tailoring user_guides/cuda + user_guides/fpga + user_guides/opencl + user_guides/cpp_demo + user_guides/java_demo .. toctree:: :maxdepth: 1 @@ -42,7 +49,12 @@ Welcome to Paddle-Lite's documentation! advanced_user_guides/support_operation_list advanced_user_guides/add_operation + advanced_user_guides/add_layout advanced_user_guides/model_quantization + advanced_user_guides/add_new_pass + advanced_user_guides/npu + advanced_user_guides/x86 + advanced_user_guides/cv .. toctree:: :maxdepth: 1 @@ -52,6 +64,8 @@ Welcome to Paddle-Lite's documentation! :maxdepth: 1 :caption: API文档 + api_reference/cxx_api_doc + .. toctree:: :maxdepth: 1 :caption: FAQ diff --git a/docs/installation/library.md b/docs/installation/library.md new file mode 100644 index 0000000000000000000000000000000000000000..ef2f8fdb18ade439d620b348738cbb752d5bd8b6 --- /dev/null +++ b/docs/installation/library.md @@ -0,0 +1,61 @@ + +# 预测库说明 + +Paddle-Lite的编译结果为预测库文件(包括静态库和动态库),具体编译过程参考[源码编译](./source_compile)。 + +Lite预测库分为**基础预测库**和**全量预测库**:基础预测库只打包了基础模型需要的基础算子,预测库体积较小;全量预测库打包了所有的Lite算子,可以支持更多的模型,但是预测库的体积也更大。 编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译基础预测库,`--build_extra=ON`时编译全量的预测库。 + +## 基础预测库 + +### 编译方法 +编译时设置`--build_extra=OFF` (默认值) 或不指定即可编译出基础预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish +``` + +### 基础预测库支持的功能 + +(1)支持基础CV模型 + +(2)支持基础的in8量化模型 + +(3)支持[benchmark测试](../benchmark/benchmark) + + +### 基础预测库支持的基础模型: + +1. fluid基础模型(paddle model 提供的基础模型9个) + +``` +mobileNetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 +mobileNetV2 resnet50 unet squeezenet_v11 +``` + +2. int8量化模型模型 + +``` +mobilenet_v1 mobilenet_v2 resnet50 +``` + +### 特点 + 轻量级预测库,体积更小,支持常用的基础模型。 + + + +## 全量预测库 + +### 编译方法 +编译时设置`--build_extra=ON` 即可编译出全量预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish +``` +### 全量预测库功能 + +(1) 基础预测库所有功能 + +(2)支持所有Paddle-Lite中注册的所有算子 + +### 特点 + 支持更多的硬件平台和算子,可以支持更多模型但体量更大。 diff --git a/docs/installation/source_compile.md b/docs/installation/source_compile.md index c0a86d92b6eba5526992031f36441fb8cc4fb537..f2016b83188b755eca8daab8a4aa38b25e08c0f1 100644 --- a/docs/installation/source_compile.md +++ b/docs/installation/source_compile.md @@ -1,7 +1,415 @@ + # 源码编译 +Paddle-Lite 提供了移动端的一键源码编译脚本 `lite/tools/build.sh`,编译流程如下: + +1. 环境准备(选择其一):Docker交叉编译环境、Linux交叉编译环境 +2. 编译:调用`build.sh`脚本一键编译 + +## 一、环境准备 + 目前支持三种编译的环境: 1. Docker 容器环境, 2. Linux(推荐 Ubuntu 16.04)环境, 3. 
Mac OS 环境。 + +### 1、 Docker开发环境 + +[Docker](https://www.docker.com/) 是一个开源的应用容器引擎, 使用沙箱机制创建独立容器,方便运行不同程序。Docker初学者可以参考[Docker使用方法](https://thenewstack.io/docker-station-part-one-essential-docker-concepts-tools-terminology/)正确安装Docker。 + +#### 准备Docker镜像 + +有两种方式准备Docker镜像,推荐从Dockerhub直接拉取Docker镜像 + +```shell +# 方式一:从Dockerhub直接拉取Docker镜像 +docker pull paddlepaddle/paddle-lite:2.0.0_beta + +# 方式二:本地源码编译Docker镜像 +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite/lite/tools +mkdir mobile_image +cp Dockerfile.mobile mobile_image/Dockerfile +cd mobile_image +docker build -t paddlepaddle/paddle-lite . + +# 镜像编译成功后,可用`docker images`命令,看到`paddlepaddle/paddle-lite`镜像。 +``` + +#### 进入Docker容器 + +在拉取Paddle-Lite仓库代码的上层目录,执行如下代码,进入Docker容器: + +```shell +docker run -it \ + --name paddlelite_docker \ + -v $PWD/Paddle-Lite:/Paddle-Lite \ + --net=host \ + paddlepaddle/paddle-lite /bin/bash +``` + +该命令的含义:将容器命名为`paddlelite_docker`即``,将当前目录下的`Paddle-Lite`文件夹挂载到容器中的`/Paddle-Lite`这个根目录下,并进入容器中。至此,完成Docker环境的准备。 + +#### Docker常用命令 + +```shell +# 退出容器但不停止/关闭容器:键盘同时按住三个键:CTRL + q + p + +# 启动停止的容器 +docker start + +# 从shell进入已启动的容器 +docker attach + +# 停止正在运行的Docker容器 +docker stop + +# 重新启动正在运行的Docker容器 +docker restart + +# 删除Docker容器 +docker rm +``` + +### 2、Linux 开发环境 + +#### Android + +##### 交叉编译环境要求 + +- gcc、g++、git、make、wget、python、adb +- Java environment +- cmake(建议使用3.10或以上版本) +- Android NDK (建议ndk-r17c) + +##### 具体步骤 + +安装软件部分以 Ubuntu 为例,其他 Linux 发行版类似。 + +```shell +# 1. Install basic software +apt update +apt-get install -y --no-install-recommends \ + gcc g++ git make wget python unzip adb curl + +# 2. Prepare Java env. +apt-get install -y default-jdk + +# 3. Install cmake 3.10 or above +wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \ + tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \ + mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \ + ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \ + ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake + +# 4. Download Android NDK for linux-x86_64 +# Note: Skip this step if NDK installed +# recommand android-ndk-r17c-darwin-x86_64 +# ref: https://developer.android.com/ndk/downloads +cd /tmp && curl -O https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip +cd /opt && unzip /tmp/android-ndk-r17c-linux-x86_64.zip + +# 5. Add environment ${NDK_ROOT} to `~/.bashrc` +echo "export NDK_ROOT=/opt/android-ndk-r17c" >> ~/.bashrc +source ~/.bashrc +``` + +#### ARM Linux + +适用于基于 ARMv8 和 ARMv7 架构 CPU 的各种开发板,例如 RK3399,树莓派等,目前支持交叉编译和本地编译两种方式,对于交叉编译方式,在完成目标程序编译后,可通过 scp 方式将程序拷贝到开发板运行。 + +##### 交叉编译 + +###### 编译环境要求 + +- gcc、g++、git、make、wget、python、scp +- cmake(建议使用3.10或以上版本) + +###### 具体步骤 + +安装软件部分以 Ubuntu 为例,其他 Linux 发行版类似。 + +```shell +# 1. Install basic software +apt update +apt-get install -y --no-install-recommends \ + gcc g++ git make wget python unzip + +# 2. Install arm gcc toolchains +apt-get install -y --no-install-recommends \ + g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \ + g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \ + gcc-aarch64-linux-gnu g++-aarch64-linux-gnu + +# 3. 
Install cmake 3.10 or above +wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \ + tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \ + mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \ + ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \ + ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake +``` + +##### 本地编译(直接在RK3399或树莓派上编译) + +###### 编译环境要求 + +- gcc、g++、git、make、wget、python +- cmake(建议使用3.10或以上版本) + +###### 具体步骤 + +安装软件部分以 Ubuntu 为例,其他 Linux 发行版本类似。 + +```shell +# 1. Install basic software +apt update +apt-get install -y --no-install-recomends \ + gcc g++ make wget python unzip + +# 2. install cmake 3.10 or above +wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz +tar -zxvf cmake-3.10.3.tar.gz +cd cmake-3.10.3 +./configure +make +sudo make install +``` + +之后可通过cmake --version查看cmake是否安装成功。 + +至此,完成 Linux 交叉编译环境的准备。 + +### 3、Mac OS 开发环境 + +#### 交叉编译环境要求 + +- gcc、git、make、curl、unzip、java +- cmake(Android编译请使用3.10版本,IOS编译请使用3.15版本) +- 编译Android: Android NDK (建议ndk-r17c) +- 编译IOS: XCode(Version 10.1) + +#### 具体步骤 + +```bash +# 1. Install basic software +brew install curl gcc git make unzip wget + +# 2. Install cmake: mac上实现IOS编译和Android编译要求的cmake版本不一致,可以根据需求选择安装。 +# (1)在mac环境编译 Paddle-Lite 的Android版本,需要安装cmake 3.10 +# mkdir /usr/local/Cellar/cmake/ && cd /usr/local/Cellar/cmake/ +# wget https://cmake.org/files/v3.10/cmake-3.10.2-Darwin-x86_64.tar.gz +# tar zxf ./cmake-3.10.2-Darwin-x86_64.tar.gz +# mv cmake-3.10.2-Darwin-x86_64/CMake.app/Contents/ ./3.10.2 +# ln -s /usr/local/Cellar/cmake/3.10.2/bin/cmake /usr/local/bin/cmake +# (2)在mac环境编译 Paddle-Lite 的IOS版本,需要安装cmake 3.15 +# mkdir /usr/local/Cellar/cmake/ && cd /usr/local/Cellar/cmake/ +# cd /usr/local/Cellar/cmake/ +# wget https://cmake.org/files/v3.15/cmake-3.15.2-Darwin-x86_64.tar.gz +# tar zxf ./cmake-3.15.2-Darwin-x86_64.tar.gz +# mv cmake-3.15.2-Darwin-x86_64/CMake.app/Contents/ ./3.15.2 +# ln -s /usr/local/Cellar/cmake/3.15.2/bin/cmake /usr/local/bin/cmake + +# 3. Download Android NDK for Mac +# recommand android-ndk-r17c-darwin-x86_64 +# ref: https://developer.android.com/ndk/downloads +# Note: Skip this step if NDK installed +cd ~/Documents && curl -O https://dl.google.com/android/repository/android-ndk-r17c-darwin-x86_64.zip +cd ~/Library && unzip ~/Documents/android-ndk-r17c-darwin-x86_64.zip + +# 4. Add environment ${NDK_ROOT} to `~/.bash_profile` +echo "export NDK_ROOT=~/Library/android-ndk-r17c" >> ~/.bash_profile +source ~/.bash_profile + +# 5. Install Java Environment +brew cask install java + +# 6. 
编译IOS需要安装XCode(Version 10.1),可以在App Store里安装。安装后需要启动一次并执行下面语句。 +# sudo xcode-select -s /Applications/Xcode.app/Contents/Developer +``` + +至此,完成 Mac 交叉编译环境的准备。 + +**注意**: Mac上编译Paddle-Lite的full_publish版本时,Paddle-Lite所在路径中不可以含有中文字符 + +## 二、编译PaddleLite + +### 下载代码 + +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +git checkout +``` + +### 编译模式与参数 + +编译脚本`./lite/tools/build.sh`,支持三种编译模式: + +| 编译模式 | 介绍 | 适用对象 | +|:-------:|-----|:-------:| +| tiny_publish | 编译移动端部署库,无第三方库依赖 | 用户 | +| full_publish | 编译移动端部署库,有第三方依赖如protobuf、glags等,含有可将模型转换为无需protobuf依赖的naive buffer格式的工具,供tiny_publish库使用 | 用户 | +| test | 编译指定`arm_os`、`arm_abi`下的移动端单元测试 | 框架开发者 | + +编译脚本`./lite/tools/build.sh`,追加参数说明: + +| 参数 | 介绍 | 值 | +|-----------|-------------|-------------| +| --arm_os |必选,选择安装平台 | `android`、`ios`、`ios64`、`armlinux` | +| --arm_abi |必选,选择编译的arm版本,其中`armv7hf`为ARMLinux编译时选用| `armv8`、`armv7`、`armv7hf`(仅`armlinux`支持) | +| --arm_lang |arm_os=android时必选,选择编译器 | `gcc`、`clang`(`clang`当前暂不支持) | +| --android_stl |arm_os=android时必选,选择静态链接STL或动态链接STL | `c++_static`、`c++_shared`| +| --build_java | 可选,是否编译java预测库(默认为OFF) | `ON`、`OFF` | +| --build_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | +| target |必选,选择编译模式,`tiny_publish`为编译移动端部署库、`full_publish`为带依赖的移动端部署库、`test`为移动端单元测试、`ios`为编译ios端`tiny_publish` | `tiny_publish`、`full_publish`、`test`、 `ios` | + +### 编译代码 + +**注意**:非开发者建议在编译前使用[**“加速第三方依赖库的下载”**](#id22)的方法,加速工程中第三方依赖库的下载与编译。 + +#### 编译`tiny publish`动态库 + +##### Android +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --build_extra=OFF \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=OFF \ + tiny_publish +``` +##### IOS +```shell +./lite/tools/build.sh \ + --arm_os=ios64 \ + --arm_abi=armv8 \ + --build_extra=OFF \ + ios +``` +**注意:mac环境编译IOS 时,cmake版本需要高于cmake 3.15;mac环境上编译Android时,cmake版本需要设置为cmake 3.10。** + +ios tiny publish支持的编译选项: + +* `--arm_os`: 可选ios或者ios64 +* `--arm_abi`: 可选armv7和armv8(**注意**:当`arm_os=ios`时只能选择`arm_abi=armv7`,当`arm_os=ios64`时只能选择`arm_abi=armv8`) +* 如果mac编译过程中报错:"Invalid CMAKE_DEVELOPER_ROOT: does not exist", 运行: +```shell +sudo xcode-select -s /Applications/Xcode.app/Contents/Developer +``` +##### ARMLinux +```shell +./lite/tools/build.sh \ + --build_extra=OFF \ + --arm_os=armlinux \ + --arm_abi=armv7hf \ + --arm_lang=gcc \ + --build_extra=OFF \ + tiny_publish +``` +- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 + +#### 编译`full publish`动态库 + +##### Android +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --build_extra=OFF \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=OFF \ + full_publish +``` +##### ARMLinux +```shell +./lite/tools/build.sh \ + --arm_os=armlinux \ + --arm_abi=armv7hf \ + --arm_lang=gcc \ + --build_extra=OFF \ + full_publish +``` +- `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 + +### 编译结果说明 + +**编译最终产物位置**在 `build.lite.xxx.xxx.xxx` 下的 `inference_lite_lib.xxx.xxx` ,如 Android 下 ARMv8 的产物位于`inference_lite_lib.android.armv8`: + +![](https://user-images.githubusercontent.com/45189361/65375706-204e8780-dccb-11e9-9816-ab4563ce0963.png) + +**目录内容**(可能)如下: + +**Full_publish编译结果:** + +![](https://user-images.githubusercontent.com/45189361/65375704-19c01000-dccb-11e9-9650-6856c7a5bf82.png) + +**Tiny_publish结果:** + +![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) + +**IOS编译结果:** + 
+![](https://user-images.githubusercontent.com/45189361/65375726-3bb99280-dccb-11e9-9903-8ce255371905.png) + + + +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件 `paddle_code_generator`、`test_model_bin` + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- `include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - `libpaddle_api_light_bundled.a` :只包含 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + - `libpaddle_light_api_shared.so`:只包含 light_api 功能的动态库 + +3、 `demo`文件夹:示例 demo ,包含 C++ demo 和 Java demo。 + +- `cxx` : C++示例 demo + - `mobile_full` : full_api 的使用示例 + - `mobile_light` : light_api的使用示例 +- `java` :Java 示例 demo + - `android` : Java的 Android 示例 + +4、 `java` 文件夹:包含 Jni 的动态库文件与相应的 Jar 包 + +- `jar` : `PaddlePredictor.jar` +- `so` : Jni动态链接库 `libpaddle_lite_jni.so` + +5、 `third_party` 文件夹:第三方库文件`gflags` + +**注意:** + +1、 只有当`--arm_os=android` 时才会编译出: + +- Java库文件与示例:`Java`和`demo/java` + +- 动态库文件:`libpaddle_full_api_shared.so`,`libpaddle_light_api_shared.so` + +2、 `tiny_publish`编译结果不包括 C++ demo和 C++ 静态库,但提供 C++ 的 light_api 动态库、 Jni 动态库和Java demo + +### 加速第三方依赖库的下载 + +移动端相关编译所需的第三方库均位于 `/third-party` 目录下,默认编译过程中,会利用`git submodule update --init --recursive`链上相关的第三方依赖的仓库。 + +为加速`full_publish`、`test`编译模式中对`protobuf`等第三方依赖的下载,`build.sh` 和 `ci_build.sh`支持了从国内 CDN 下载第三方依赖的压缩包。 + +使用方法:`git clone`完`Paddle-Lite`仓库代码后,手动删除本地仓库根目录下的`third-party`目录: + +```shell +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +git checkout +cd Paddle-Lite +rm -rf third-party +``` + +之后再根据本文档,进行后续编译时,便会忽略第三方依赖对应的`submodule`,改为下载第三方压缩包。 diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md new file mode 100644 index 0000000000000000000000000000000000000000..b4f76577bc9a0b80b188aedfc2c5cf33f786033a --- /dev/null +++ b/docs/introduction/support_hardware.md @@ -0,0 +1,32 @@ + +# 支持硬件列表 + + +## ARM CPU +Paddle Lite支持[ARM Cortex-A系列处理器](https://en.wikipedia.org/wiki/ARM_Cortex-A),支持列表如下: +### 32bit(ARMv7a) +- Cortex-A5 +- Cortex-A7 +- Cortex-A8 +- Cortex-A9 +- Cortex-A12 +- Cortex-A15 +- Cortex-A17(RK3288) +- Cortex-A32 +### 64bit(ARMv7a, ARMv8a) +- Cortex-A35 +- Cortex-A53(树莓派3) +- Cortex-A55 +- Cortex-A57(Nvidia tx1,Nvidia tx2, 高通810等) +- Cortex-A72(麒麟95X,高通820, RK3399,树莓派4等) +- Cortex-A73(麒麟960,麒麟970,高通835, 联发科X30等) +- Cortex-A75(高通845等) +- Cortex-A76(麒麟980,麒麟990,高通855,高通730,联发科G90等) +- Cortex-A77 +- ARMv8-A compatible(Apple A系列处理器, Nvidia tegra, Qualcomm Kryo, Falkor, Samsung Mongoose) + +## 移动端GPU +Paddle Lite支持移动端GPU和Nvidia端上GPU设备,支持列表如下: +- ARM Mali G 系列 +- Qualcomm Adreno 系列 +- Nvida tegra系列: tx1, tx2, nano, xavier diff --git a/docs/user_guides/cpp_demo.md b/docs/user_guides/cpp_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..a915a3f05ef133988db10a77584b565352a1a8f6 --- /dev/null +++ b/docs/user_guides/cpp_demo.md @@ -0,0 +1,343 @@ +# C++ Demo + +## 编译 + +首先按照[PaddleLite 源码编译](https://github.com/PaddlePaddle/Paddle-Lite/wiki/source_compile)准备交叉编译环境,之后拉取最新[PaddleLite release发布版代码](https://github.com/PaddlePaddle/Paddle-Lite)。下面以Android-ARMv8架构为例,介绍编译过程,并最终在手机上跑通MobilNetv1模型。 + +进入 Paddle-Lite 目录,运行以下命令编译代码(**需加编译选项`--build_extra=ON`确保完整编译**): + +``` +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --android_stl=c++_static \ + --build_extra=ON \ + full_publish +``` + +编译完成后 `./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/` 文件夹下包含: + +- cxx + - include (头文件文件夹) + - lib (库文件文件夹) + - libpaddle_api_full_bundled.a + - 
libpaddle_api_light_bundled.a + - libpaddle_light_api_shared.so + - libpaddle_full_api_shared.so +- demo + - cxx (C++ demo) + - mobile_light (light api demo) + - mobile_full (full api demo) + - mobile_detection (detection model api demo) + - mobile_classify (classify model api demo) + - Makefile.def + - include +- third_party (第三方库文件夹) + - gflags + +## 准备执行环境 + +执行环境有两种:使用安卓手机;若没安卓手机,也可在安卓模拟器中执行。 + +### 环境一:使用安卓手机 + +将手机连上电脑,在手机上打开选项 -> 开启-开发者模式 -> 开启-USB调试模式。确保 `adb devices` 能够看到相应的设备。 + +### 环境二:使用安卓模拟器 + +运行下面命令,分别创建安卓armv8、armv7架构的模拟器。若需在真机测试,将模拟器换成相应架构的真机环境即可。 + +``` +*android-armv8* +adb kill-server +adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done +echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" +echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 & +sleep 1m +``` + +``` +*android-armv7* +adb kill-server +adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done +echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" +echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 & +sleep 1m +``` + +## 下载模型并运行示例 + +``` +cd inference_lite_lib.android.armv8/demo/cxx/mobile_full +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz + +make + +adb push mobilenet_v1 /data/local/tmp/ +adb push mobilenetv1_full_api /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_full_api +adb shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" +``` + +注:我们也提供了轻量级 API 的 demo、图像分类demo和目标检测demo,支持图像输入; + +### Light API Demo + +``` +cd ../mobile_light +make +adb push mobilenetv1_light_api /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt " +``` + + +### 图像分类 Demo + +``` +cd ../mobile_classify +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +make +adb push mobile_classify /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push labels.txt /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_classify +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1.opt /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` + +### 目标检测 Demo + +``` +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb push mobile_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_detection_result.jpg ./ +``` + +## Demo 程序运行结果 + +### light API Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前10个类别的预测概率: + +``` +Output dim: 1000 +Output[0]: 0.000191 +Output[100]: 0.000160 +Output[200]: 0.000264 +Output[300]: 0.000211 +Output[400]: 0.001032 +Output[500]: 0.000110 +Output[600]: 0.004829 +Output[700]: 
0.001845 +Output[800]: 0.000202 +Output[900]: 0.000586 +``` + +### 图像分类 Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前5个类别的类型索引、名字和预测概率: + +``` +parameter: model_dir, image_path and label_file are necessary +parameter: topk, input_width, input_height, are optional +i: 0, index: 285, name: Egyptian cat, score: 0.482870 +i: 1, index: 281, name: tabby, tabby cat, score: 0.471593 +i: 2, index: 282, name: tiger cat, score: 0.039779 +i: 3, index: 287, name: lynx, catamount, score: 0.002430 +i: 4, index: 722, name: ping-pong ball, score: 0.000508 +``` + +### 目标检测 Demo 运行结果 + +运行成功后 ,将在控制台输出检测目标的类型、预测概率和坐标: + +``` +running result: +detection image size: 935, 1241, detect object: person, score: 0.996098, location: x=187, y=43, width=540, height=592 +detection image size: 935, 1241, detect object: person, score: 0.935293, location: x=123, y=639, width=579, height=597 +``` + +## 如何在代码中使用 API + +在C++中使用PaddleLite API非常简单,不需要添加太多额外代码,具体步骤如下: + +- 加入头文件引用 + +``` + #include + #include + #include "paddle_api.h" + #include "paddle_use_kernels.h" + #include "paddle_use_ops.h" + #include "paddle_use_passes.h" +``` + +- 通过MobileConfig设置:模型文件位置(model_dir)、线程数(thread)和能耗模式( power mode )。输入数据(input),从 MobileConfig 创建 PaddlePredictor 并执行预测。 (注:Lite还支持从memory直接加载模型,可以通过MobileConfig::set_model_buffer方法实现) + +代码示例: + +``` +// 1. Create MobileConfig +MobileConfig config; + +// 2. Load model +config.set_model_dir("path to your model directory"); // model dir +/*load model: Lite supports loading model from file or from memory (naive buffer from optimized model) +//Method One: Load model from memory: +void set_model_buffer(const char* model_buffer, + size_t model_buffer_size, + const char* param_buffer, + size_t param_buffer_size) +//Method Two: Load model from file: +void set_model_dir(const std::string& model_dir) */ + +// 3. Set MobileConfig (or you can skip this step to use default value): +config.set_power_mode(LITE_POWER_HIGH); // power mode +/*power modes: Lite supports the following power modes + LITE_POWER_HIGH + LITE_POWER_LOW + LITE_POWER_FULL + LITE_POWER_NO_BIND + LITE_POWER_RAND_HIGH + LITE_POWER_RAND_LOW */ +config.set_threads("num of threads"); // threads + +// 4. Create PaddlePredictor by MobileConfig +std::shared_ptr predictor = + CreatePaddlePredictor(config); + +// 5. Prepare input data +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto *data = input_tensor -> mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} + +// 6. Run predictor +predictor->Run(); + +// 7. Get output +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +``` + +## CxxConfig案例: OCR_model的运行 + +1. OCR 模型文件: + - 我们提供Pb格式的[ocr_attention_mode](https://paddle-inference-dist.cdn.bcebos.com/ocr_attention.tar.gz)l下载 + - 也可以从[Paddle/model项目](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition)中训练出模型 +2. 示例代码: + + +``` +#include "paddle_api.h" // NOLINT +#include "paddle_use_passes.h" // NOLINT +#include +#include +#include +using namespace paddle::lite_api; // NOLINT + +DEFINE_string(model_dir, "", "Model dir path."); +DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); + +int64_t ShapeProduction(const shape_t &shape) { + int64_t res = 1; + for (auto i : shape) + res *= i; + return res; +} + +void RunModel() { + // 1. 
Set CxxConfig + CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + std::vector valid_places({Place{TARGET(kARM), PRECISION(kFloat)}}); + if (FLAGS_prefer_int8_kernel) { + valid_places.insert(valid_places.begin(), + Place{TARGET(kARM), PRECISION(kInt8)}); + } + config.set_valid_places(valid_places); + + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + // input 0 + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize(shape_t({1, 1, 48, 512})); + auto *data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + // input1 + std::unique_ptr init_ids(std::move(predictor->GetInput(1))); + init_ids->Resize(shape_t({1, 1})); + auto *data_ids = init_ids->mutable_data(); + for (int i = 0; i < ShapeProduction(init_ids->shape()); ++i) { + data_ids[i] = 0; + } + + lod_t lod_i; + lod_i.push_back({0, 1}); + lod_i.push_back({0, 1}); + init_ids->SetLoD(lod_i); + // input2 + std::unique_ptr init_scores(std::move(predictor->GetInput(2))); + init_scores->Resize(shape_t({1, 1})); + auto *data_scores = init_scores->mutable_data(); + for (int i = 0; i < ShapeProduction(init_scores->shape()); ++i) { + data_scores[i] = 0; + } + lod_t lod_s; + lod_s.push_back({0, 1}); + lod_s.push_back({0, 1}); + init_scores->SetLoD(lod_s); + + // 4. Run predictor + predictor->Run(); + + // 5. Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) { + printf("Output[%d]: %f\n", i, output_tensor->data()[i]); + } +} + +int main(int argc, char **argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + RunModel(); + return 0; +} +``` + +3. 运行方法: + 参考以上代码编译出可执行文件`OCR_DEMO`,模型文件夹为`ocr_attention`。手机以USB调试、文件传输模式连接电脑。 +``` +简单编译出`OCR_DEMO`的方法:用以上示例代码替换编译结果中`build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_full/mobilenetv1_full_api.cc`文件的内容,终端进入该路径(`demo/cxx/mobile_full/`),终端中执行`make && mv mobilenetv1_full_api OCR_DEMO`即编译出了OCR模型的可执行文件`OCR_DEMO` +``` + 在终端中输入以下命令执行OCR model测试: + +``` +#OCR_DEMO为编译出的可执行文件名称;ocr_attention为ocr_attention模型的文件夹名称;libpaddle_full_api_shared.so是编译出的动态库文件,位于`build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib` +adb push OCR_DEMO /data/local/tmp +adb push ocr_attention /data/local/tmp +adb push libpaddle_full_api_shared.so /data/local/tmp/ +adb shell 'export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && cd /data/local/tmp && ./OCR_DEMO --model_dir=./OCR_DEMO' +``` + +4. 
运行结果 + + diff --git a/docs/user_guides/fpga.md b/docs/user_guides/fpga.md new file mode 100644 index 0000000000000000000000000000000000000000..a7c398af2036cab7d914a692ce4f8fdbae13d45c --- /dev/null +++ b/docs/user_guides/fpga.md @@ -0,0 +1,106 @@ +# Lite基于FPGA的模型预测 + +Paddle Lite支持基于arm的FPGA zu3/zu5/zu9的模型预测,提供armv8的交叉编译 + +Lite基于FPGA运行模型需要相应的FPGA驱动,目前只支持百度[Edgeboard开发板](https://ai.baidu.com/tech/hardware/deepkit) + +## Lite实现FPGA简介 + +Lite支持FPGA作为后端硬件进行模型推理,其主要特性如下: + +- Lite中FPGA的kernel(feed、fetch除外)均以FP16、NHWC的格式作为输入输出格式,所有的weights和bias仍为FP32、NCHW的格式,feed的输入和fetch的输出均为FP32、NCHW格式的数据,在提升计算速度的同时能做到用户对数据格式无感知 + +- 对于FPGA暂不支持的kernel,均会切回arm端运行,实现arm+FPGA混合布署运行 + +- 目前FPGA成本功耗都较低,Lite基于FPGA的模型性能远远好于arm端,可作为边缘设备首选硬件 + +## 编译 + +需要提前准备带有FPGAdrv.ko的FPGA开发板(如edgeboard开发板)和Lite代码 + +CMAKE编译选项: + +- 设置`LITE_WITH_FPGA=ON`和`LITE_WITH_ARM=ON` + +其他编译选项与ARM编译相同,可以参考[“Paddle Lite在Docker下的ARM编译”](../source_compile)。 +示例如下: +```shell + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DLITE_WITH_OPENMP=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_FPGA=ON \ + -DARM_TARGET_OS=armlinux + make publish_inference -j2 +``` +Lite提供FPGA编译脚本,位于lite/tools/build_FPGA.sh,在Lite根目录执行该脚本即可编译 + +## 运行示例 + +- **运行文件准备** + +下面以Resnet50模型为例,介绍如何使用edgeboard开发板实现模型运行 + +```bash +#连接开发板,并利用screen命令启动 [本机执行] +screen /dev/cu.SLAB_USBtoUART 115200 +#查看开发板ip并ssh登录到开发板,假设开发板ip为192.0.1.1 [本机执行] +ssh root@192.0.1.1 + +#在开发板上建立目录workspace,拷贝FPGA驱动FPGAdrv.ko到workspace目录 [开发板执行] +mkdir workspace && scp $DRIVER_PATH/FPGAdrv.ko workspace + +#将Lite中编译好的测试程序拷贝到开发板workspace目录 [本机执行] +scp $LITE_ROOT/build_FPGA/lite/api/test_resnet50_FPGA root@$EDGEBOARD_IP:workspace/ +#把Resnet50的模型和参数scp到开发板workspace目录 [本机执行] +scp -r $LITE_ROOT/build_FPGA/lite/third_party/install/resnet50/ root@$EDGEBOARD_IP:workspace/ + +#在运行模型前需要加载FPGA驱动 [开发板执行] +insmod FPGAdrv.ko +#给测试程序添加可运行权限 [开发板执行] +chmod +x test_resnet50_FPGA +``` + +- **使用FPGA进行模型预测** + +```bash +#以下命令均在开发板上运行 +#直接运行单测程序 +./test_resnet50_FPGA --model_dir=resnet50 +#如果需要测试性能,可以用repeats参数设置模型运行次数(如1000),同时可以设置预热次数(如10)来让硬件事先运行到稳定水平 +./test_resnet50_FPGA --model_dir=resnet50 --repeats=1000 --warmup=10 +``` + +## 如何在Code中使用 + +在Lite中使用FPGA与ARM相似,具体的区别如下: + +- 由于fpga运行模式为fp16精度、nhwc布局,所以需要修改相应的`valid_place` +- fpga不需要device的初始化和运行模式设置 + +代码示例: +```cpp +lite::Predictor predictor; +std::vector valid_places( + {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},Place{TARGET(kARM)}); + +predictor.Build(model_dir, "", "", valid_places); + +auto* input_tensor = predictor.GetInput(0); +input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); +auto* data = input_tensor->mutable_data(); +auto item_size = input_tensor->dims().production(); +//假设设置输入数据全为1 +for (int i = 0; i < item_size; i++) { + data[i] = 1; +} + +predictor.Run(); +auto* out = predictor.GetOutput(0); +``` diff --git a/docs/user_guides/index.rst b/docs/user_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/user_guides/java_demo.md b/docs/user_guides/java_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..4a09826cd45f6ae1b8c46331d54d2f61af32fb14 --- /dev/null +++ b/docs/user_guides/java_demo.md @@ -0,0 +1,99 @@ +# Java Demo + +本节中,Java demo 完整代码位于 [demo/java](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite/demo/java) 。 + +要编译和跑起Android demo 程序 PaddlePredictor,你需要准备: + +1. 
一台能运行安卓程序的安卓手机 +2. 一台带有AndroidStudio的开发机 + +## 编译 + +首先在PaddleLite的开发 [Docker镜像](../source_compile) 中,拉取最新PaddleLite代码,编译对应你手机架构的预测库, +下面我们以arm8 架构举例。进入paddlelite 目录,运行以下命令: + +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --android_stl=c++_static \ + tiny_publish +``` + +命令完成后查看要存在 + +``` +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/java/so/libpaddle_lite_jni.so +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/java/jar/PaddlePredictor.jar +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android +``` + +libpaddle_lite_jni.so为 PaddleLite c++ 动态链接库,PaddlePredictor.jar为 Java jar 包,两者包含 PaddleLite Java API,接下来 Android Java 代码会使用这些api。android文件夹中则是Android demo。 + +## 准备 demo 需要的其他文件 + +Demo 除了代码,还需要准备在Android工程目录下配置好JNI .so 库(上节提到的`libpaddle_lite_jni.so`),Java .jar 包(上文提到的`PaddlePredictor.jar` ),和模型文件。我们提供了自动化的脚本和手动拷贝两种方法,用户可以根据自己需要选择: + +### 脚本方法 + +进入 `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android`,我们准备了一个脚本`prepare_demo.bash`,脚本输入一个参数,为你要拷贝的.so 对应的架构文件夹名。 + +例如运行 + +``` +bash prepare_demo.bash arm8 +``` + +该脚本自动下载并解压缩模型文件,拷贝了 .jar 包进demo,还有生成的.so包进`PaddlePredictor/app/src/main/jinLibs/架构文件夹下`, +在我们这个例子里,armv8 就是架构文件夹。备注:这种方式构建的 demo 在 armv8 手机运行正常。如果要demo 程序在别的手机架构(如 armv7)上也运行正常,需要添加别的架构。 + +### 手动拷贝方法 + +接下来我们介绍手动拷贝,如果使用了脚本,那么可以跳过以下手动方法的介绍。 + +### 把 .so 动态库和 .jar 拷贝进安卓demo程序: + +1. 将PaddlePredictor 载入到AndroidStudio。 +2. 将`libpaddle_lite_jni.so`拷贝进 `PaddlePredictor/app/src/main/jinLibs/架构文件夹下` ,比如文件夹arm8里要包含该 .so文件。 +3. 将 `PaddlePredictor.jar` 拷贝进 `PaddlePredictor/app/libs` 下 + +### 把demo使用到的模型文件拷贝进安卓程序: + +下载我们的5个模型文件,并解压缩到 `PaddlePredictor/app/src/main/assets` 这个文件夹中 +需要拷贝的模型文件和下载地址: + +``` +inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz +lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz +mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz +mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz +resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz +``` + +下载完后,assets文件夹里要包含解压后的上面五个模型文件夹,但demo里不需要保存原压缩.tar.gz 文件。 + +注意:输入的模型要求为naive buffer存储格式,您可以通过 [**Model Optimize Tool**](../model_optimize_tool) 将fluid模型转为naive buffer存储格式。 + +## 运行 Android 程序结果 + +以上准备工作完成,就可以开始Build 、安装、和运行安卓demo程序。当你运行PaddlePredictor 程序时,大概会等10秒,然后看到类似以下字样: + +``` +lite_naive_model output: 50.213173, -28.872887 +expected: 50.2132, -28.8729 + +inception_v4_simple test:true +time: xxx ms + +resnet50 test:true +time: xxx ms + +mobilenet_v1 test:true +time: xxx ms + +mobilenet_v2 test:true +time: xxx ms +``` + +该 demo 程序跑我们的 5 个模型,第一个模型结果将真正的头两个数字输出,并在第二行附上期望的正确值。你应该要看到他们的误差小于0.001。后面四个模型如果你看到 `test:true` 字样,说明模型输出通过了我们在 demo 程序里对其输出的测试。time 代表该测试花费的时间。 diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md new file mode 100644 index 0000000000000000000000000000000000000000..5ba12cf819945ab2f182f672a2c96123bc12e070 --- /dev/null +++ b/docs/user_guides/library_tailoring.md @@ -0,0 +1,185 @@ + +# 裁剪预测库方法 + +Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编译会将所有已注册的operator打包到预测库中,造成库文件体积膨胀;**裁剪预测库**能针对具体的模型,只打包优化后该模型需要的operator,有效降低预测库文件大小。 + +## 效果展示(Tiny_publish Android动态预测库体积) + +| 测试模型 | 裁剪开关 | **libpaddle_lite_jni.so** |转化后模型中的OP| +| ------------------ | ---------------------------- | -------- |------------------| +| 
mobilenetv1(armv8) | 裁剪前--build_tailor=OFF | 1.5M | feed,etch,conv2d,depthwise_conv2d,fc,fpool2d,softmax | +| mobilenetv1(armv8) | 裁剪后--build_tailor=ON | 788K |feed,etch,conv2d,depthwise_conv2d,fc,fpool2d,softmax| +| mobilenetv2(armv8) | 裁剪前--build_tailor=OFF | 1.5M | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax | +| mobilenetv2(armv8) | 裁剪后--build_tailor=ON | 912K |feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax| +| mobilenetv1(armv7) | 裁剪前--build_tailor=OFF | 938K |feed,fetch,concat,conv2d,dropout,fc,pool2d,softmax| +| mobilenetv1(armv7) | 裁剪后--build_tailor=ON | 607K |feed,fetch,concat,conv2d,dropout,fc,pool2d,softmax| +| mobilenetv2(armv7) | 裁剪前--build_tailor=OFF | 938K | feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax | +| mobilenetv2(armv7) | 裁剪后--build_tailor=ON |687K |feed,fetch,conv2d,depthwise_conv2d,elementwise_add,fc,pool2d,relu6,softmax| + + + + +## 实现过程: + + +### 1、转化模型时记录优化后模型信息 + +说明:使用model_optimize_tool转化模型时,选择 `--record_tailoring_info =true` 会将优化后模型的OP和kernel信息保存到输出文件夹,这些信息将用于编译裁剪后的动态库。 +注意:需要使用Paddle-Lite 最新版本(release/v2.0.0之后)代码编译出的model_optimize_tool +例如: + +```bash +./model_optimize_tool --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1NB --record_tailoring_info =true --valid_targets=arm +``` +效果:优化后模型使用的OP和kernel信息被保存在 `mobilenet_v1NB`文件夹中的隐藏文件里了 + +### 2、根据模型信息编译裁剪后的预测库 + +说明:编译Paddle-Lite时选择`--build_tailor=ON` ,并且用 `–-opt_model_dir=` 指定优化后的模型的地址 +例如: + +```bash +./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish +``` +**注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径 + +**效果**:编译出来的动态库文件变小,且可以运行优化后的模型。 + +编译出的C++预测库文件位于 : + +`build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/` + +编译出的Java预测库文件位于: + +`build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/java/so/` + +### 3、运行裁剪后的预测库文件 + +注意:基于某一模型裁剪出的预测库只能支持优化工具转化后的该模型,例如根据mobilenetV1裁剪出的 full_api预测库只能运行以protobuf格式转化出的模型mobilenetV1_opt_nb, 裁剪出的light_api预测库只能运行以naive_buffer格式转化出的模型mobilenetV1_opt_nb, 运行其他模型可能会出现`segementation fault:undifined op or kernel`。 模型转化方法参考:[使用opt转化模型](./model_optimize_tool))。 + + + +**示例1**:使用裁剪后的light_api预测库运行mobilenetv1 + +1、执行第二步编译后,light_api的C++ 示例位于 + +`/Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/demo/cxx/mobile_light` + +输入`make`命令执行编译可编译出可执行文件mobilenetv1_light_api + +2、使用adb将mobilenetV1_NB模型和mobilenetv1_light_api传到手机后执行demo: + +`./mobilenetv1_light_api --model_dir=./mobilenetV1_NB` + +注意:`mobilenetV1_NB`是用`mobilenetV1`模型转化的naive_buffer格式模型(不需要设置` --record_tailoring_info =true`,转化流程参考:[使用opt转化模型](./model_optimize_tool))。 + + + +**示例2**:使用裁剪后的full_api预测库运行mobilenetv1 + +1、执行第二步编译后,full_api的C++ 示例位于 + +`/Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/demo/cxx/mobile_light` + +替换mobilenetv1_full_api.cc代码内容: + +```C++ +#include +#include +#include +#include "paddle_api.h" // NOLINT +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#include "paddle_use_passes.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +DEFINE_string(model_dir, "", "Model dir path."); + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel() { + // 1. 
Set CxxConfig + CxxConfig config; + config.set_model_file(FLAGS_model_dir + "model"); + config.set_param_file(FLAGS_model_dir + "params"); + + std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; + config.set_valid_places(valid_places); + + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize(shape_t({1, 3, 224, 224})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; + } + + // 4. Run predictor + predictor->Run(); + + // 5. Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + printf("Output dim: %d\n", output_tensor->shape()[1]); + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + printf("Output[%d]: %f\n", i, output_tensor->data()[i]); + } +} + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + RunModel(); + return 0; +} + +``` + +2、使用adb将mobilenetV1_PB模型和mobilenetv1_full_api传到手机后执行demo: + +`./mobilenetv1_full_api --model_dir=./mobilenetV1_PB` + +注意:`mobilenetV1_PB`是用`mobilenetV1`模型转化的protobuf格式模型(不需要设置` --record_tailoring_info =true`,转化流程参考:[使用opt转化模型](./model_optimize_tool))。 + +## 按模型集合裁剪预测库 + +为了方便用户使用,我们同时提供了按模型集合进行预测库裁剪的功能。用户可以提供一个模型集合,Model Optimize Tool会根据用户所指定的模型集合分析其**优化后的**模型所需要的算子信息对预测库进行裁剪。使用此功能用户根据自己的需要使用模型集合来对预测库中的算子进行任意裁剪。 + +使用方法如下所示: + +```shell +# 非combined模型集合 +./model_optimize_tool \ + --model_set_dir= \ + --optimize_out_type=naive_buffer \ + --optimize_out= \ + --record_tailoring_info=true \ + --valid_targets=arm + +# combined模型集合 +./model_optimize_tool \ + --model_set_dir= \ + --optimize_out_type=naive_buffer \ + --model_filename= \ + --param_filename= \ + --optimize_out= \ + --record_tailoring_info=true \ + --valid_targets=arm +``` + +经过以上步骤后会在``中生成模型集合中各模型对应的NaiveBuffer格式的优化模型。此步会对模型集合中所需算子信息进行搜集并存储到``中。下一步编译预测库的流程与使用单模型进行预测库裁剪步骤相同。 + +**注意:** + +1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。 +2. 使用非combined参数模型时,模型拓扑文件名应为`__model__`,使用非combined参数模型时,集合中各模型的拓扑与参数名应相同,分别由`--model_filename`和`--param_filename`指定。 +3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。 +4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。 diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md new file mode 100644 index 0000000000000000000000000000000000000000..fccc6d8b23c78474257d11399d121816f57fc422 --- /dev/null +++ b/docs/user_guides/model_optimize_tool.md @@ -0,0 +1,161 @@ + +# 模型转化方法 + +Lite架构在预测过程中表现出来的高性能得益于其丰富的优化组件,其中包括量化、子图融合、混合调度、Kernel优选等等策略。为了使优化过程更加方便易用,我们提供了**opt**来自动完成优化步骤,输出一个轻量的、最优的可执行模型。具体使用方法介绍如下: + +**注意**:release/v2.2.0之前的模型转化工具名称为`model_optimize_tool`,从release/v2.3开始模型转化工具名称修改为`opt` + +## 准备opt +当前获得opt方法有三种: + +1. 我们提供当前develop分支编译结果下载:[opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt)、[opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) +release/v2.2.0之前版本的model_optimize_tool: [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool)、[model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) + +2. 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt` + (release/v2.2.0之前的转化工具为model_optimize_tool、release/v2.3.0之后为opt) + +3. 
可以下载Paddle-Lite源码,从源码编译出opt工具 +```bash +git clone https://github.com/PaddlePaddle/Paddle-Lite.git +cd Paddle-Lite +git checkout +./lite/tools/build.sh build_optimize_tool +``` +编译结果位于`Paddle-Lite/build.opt/lite/api/opt` +**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](../installation/source_compile)。 + +## 使用opt + +opt是x86平台上的可执行文件,需要在PC端运行:包括Linux终端和Mac终端。 + +### 帮助信息 + 执行opt时不加入任何输入选项,会输出帮助信息,提示当前支持的选项: +```bash + ./opt +``` +![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png) + +### 功能一:转化模型为Paddle-Lite格式 +opt可以将PaddlePaddle支持的模型转化为Paddle-Lite支持的模型格式,期间执行的操作包括:将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积;执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等性能指标。 + +模型优化过程: + +(1)准备待优化的PaddlePaddle模型 + +PaddlePaddle模型有两种保存格式: + Combined Param:所有参数信息保存在单个文件`params`中,模型的拓扑信息保存在`__model__`文件中。 + +![opt_combined_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fcombined_model.png) + + Seperated Param:参数信息分开保存在多个参数文件中,模型的拓扑信息保存在`__model__`文件中。 +![opt_seperated_model](https://paddlelite-data.bj.bcebos.com/doc_images%2Fseperated_model.png) + +(2) 终端中执行`opt`优化模型 +**使用示例**:转化`mobilenet_v1`模型 + +``` +./opt --model_dir=./mobilenet_v1 --valid_targets=arm --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1_opt +``` +以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示: + +![opt_resulted_model](https://paddlelite-data.bj.bcebos.com/doc_images/2.png) + + +(3) **更详尽的转化命令**总结: + +```shell +./opt \ + --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=(arm|opencl|x86|npu|xpu) \ + --prefer_int8_kernel=(true|false) \ + --record_tailoring_info =(true|false) +``` + +| 选项 | 说明 | +| ------------------- | ------------------------------------------------------------ | +| --model_dir | 待优化的PaddlePaddle模型(非combined形式)的路径 | +| --model_file | 待优化的PaddlePaddle模型(combined形式)的网络结构文件路径。 | +| --param_file | 待优化的PaddlePaddle模型(combined形式)的权重文件路径。 | +| --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | +| --optimize_out | 优化模型的输出路径。 | +| --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | +| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 | +| --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | + +* 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 +* 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 +* 优化后的模型包括__model__.nb和param.nb文件。 + +### 功能二:统计模型算子信息、判断是否支持 + +opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支持该模型。并可以打印出当前Paddle-Lite的算子支持情况。 + +(1)使用opt统计模型中算子信息 + +下面命令可以打印出mobilenet_v1模型中包含的所有算子,并判断在硬件平台`valid_targets`下Paddle-Lite是否支持该模型 + +`./opt --print_model_ops=true --model_dir=mobilenet_v1 --valid_targets=arm` + +![opt_print_modelops](https://paddlelite-data.bj.bcebos.com/doc_images/3.png) + +(2)使用opt打印当前Paddle-Lite支持的算子信息 + +`./opt --print_all_ops=true` + +以上命令可以打印出当前Paddle-Lite支持的所有算子信息,包括OP的数量和每个OP支持哪些硬件平台: + +![opt_print_allops](https://paddlelite-data.bj.bcebos.com/doc_images/4.png) + +`./opt ----print_supported_ops=true --valid_targets=x86` + +以上命令可以打印出当`valid_targets=x86`时Paddle-Lite支持的所有OP: + +![opt_print_supportedops](https://paddlelite-data.bj.bcebos.com/doc_images/5.png) 
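+
+补充说明:转换得到的 `.nb` 模型可以直接由 light api 的 `MobileConfig` 加载执行。下面给出一个最小加载示意(假设模型文件为前文示例输出的 `mobilenet_v1_opt.nb`,且由 release/v2.3.0 之后的 opt 转换得到;更早版本转换出的模型请改用 `set_model_dir` 接口),仅供参考,完整用法请以 C++ Demo 文档为准:
+
+```cpp
+#include <iostream>
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+int main() {
+  // 加载 opt 转换出的 naive_buffer 格式模型(文件名以前文示例为假设)
+  MobileConfig config;
+  config.set_model_from_file("mobilenet_v1_opt.nb");
+  // 注:release/v2.3.0 之前转换出的模型请使用 config.set_model_dir("模型目录");
+
+  // 创建 PaddlePredictor 并准备输入
+  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
+  auto input = predictor->GetInput(0);
+  input->Resize({1, 3, 224, 224});
+  auto* data = input->mutable_data<float>();
+  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.f;
+
+  // 执行预测并读取输出
+  predictor->Run();
+  auto output = predictor->GetOutput(0);
+  std::cout << "Output[0]: " << output->data<float>()[0] << std::endl;
+  return 0;
+}
+```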
+ +## 其他功能:合并x2paddle和opt的一键脚本 + +**背景**:如果想用Paddle-Lite运行第三方来源(tensorflow、caffe、onnx)模型,一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式,再使用opt将PaddlePaddle模型转化为Padde-Lite可支持格式。 +为了简化这一过程,我们提供一键脚本,将x2paddle转化和opt转化合并: + +**一键转化脚本**:[auto_transform.sh](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/auto_transform.sh) + + +**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和其环境依赖项。 + +**使用方法**: + +(1)打印帮助帮助信息:` ./auto_transform.sh` + +(2)转化模型方法 + +```bash +USAGE: + auto_transform.sh combines the function of x2paddle and opt, it can + tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form. +---------------------------------------- +example: + ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result +---------------------------------------- +Arguments about x2paddle: + --framework=(tensorflow|caffe|onnx); + --model='model file for tensorflow or onnx'; + --prototxt='proto file for caffe' --weight='weight file for caffe' + For TensorFlow: + --framework=tensorflow --model=tf_model.pb + + For Caffe: + --framework=caffe --prototxt=deploy.prototxt --weight=deploy.caffemodel + + For ONNX + --framework=onnx --model=onnx_model.onnx + +Arguments about opt: + --valid_targets=(arm|opencl|x86|npu|xpu); valid targets on Paddle-Lite. + --fluid_save_dir='path to outputed model after x2paddle' + --optimize_out='path to outputed Paddle-Lite model' +---------------------------------------- +``` diff --git a/docs/user_guides/opencl.md b/docs/user_guides/opencl.md new file mode 100644 index 0000000000000000000000000000000000000000..e9533af1ff6e2447a8e4d389df90cdb457f58fb2 --- /dev/null +++ b/docs/user_guides/opencl.md @@ -0,0 +1,242 @@ +# Lite基于OpenCL的ARM GPU预测 + +Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环境下armv8、armv7的交叉编译。 + +## 编译 + +### 编译环境 + +1. Docker 容器环境; +2. Linux(推荐 Ubuntu 16.04)环境。 + +详见 **源码编译指南-环境准备** 章节。 + +### 编译选项 + +|参数|介绍|值| +|--------|--------|--------| +|--arm_os|代表目标操作系统|目前仅支持且默认为`android`| +|--arm_abi|代表体系结构类型,支持armv8和armv7|默认为`armv8`即arm64-v8a;`armv7`即armeabi-v7a| +|--arm_lang|代表编译目标文件所使用的编译器|默认为gcc,支持 gcc和clang两种| + +### 编译Paddle-Lite OpenCL库范例 + +注:以android-armv8-opencl的目标、Docker容器的编译开发环境为例,CMake3.10,android-ndk-r17c位于`/opt/`目录下。 + +```bash +# 假设当前位于处于Lite源码根目录下 + +# 导入NDK_ROOT变量,注意检查您的安装目录若与本示例不同 +export NDK_ROOT=/opt/android-ndk-r17c + +# 删除上一次CMake自动生成的.h文件 +rm ./lite/api/paddle_use_kernels.h +rm ./lite/api/paddle_use_ops.h + +# 根据指定编译参数编译 +./lite/tools/ci_build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + build_test_arm_opencl +``` + +编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物: + +- `cxx`:该目录是编译目标的C++的头文件和库文件; +- `demo`:该目录包含了两个demo,用来调用使用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`,分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文 + - `mobile_full`:使用cxx config,可直接加载fluid模型,若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏,详细见代码注释; + - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型; +- `opencl`:该目录存放opencl实现的相关kernel。 + +```bash +. 
+|-- cxx +| |-- include +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib +| |-- libpaddle_api_full_bundled.a +| |-- libpaddle_api_light_bundled.a +| |-- libpaddle_full_api_shared.so +| `-- libpaddle_light_api_shared.so +|-- demo +| `-- cxx +| |-- Makefile.def +| |-- README.md +| |-- include +| | |-- paddle_api.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| |-- mobile_full +| | |-- Makefile +| | `-- mobilenetv1_full_api.cc +| `-- mobile_light +| |-- Makefile +| `-- mobilenetv1_light_api.cc +`-- opencl + `-- cl_kernel + |-- buffer + | |-- depthwise_conv2d_kernel.cl + | |-- elementwise_add_kernel.cl + | |-- fc_kernel.cl + | |-- im2col_kernel.cl + | |-- layout_kernel.cl + | |-- mat_mul_kernel.cl + | |-- pool_kernel.cl + | `-- relu_kernel.cl + |-- cl_common.h + `-- image + |-- channel_add_kernel.cl + |-- elementwise_add_kernel.cl + |-- pool_kernel.cl + `-- relu_kernel.cl +``` + +调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。 + + + +## 运行示例 + +下面以android、ARMv8、gcc的环境为例,介绍3个示例,分别如何在手机上执行基于OpenCL的ARM GPU推理过程。 + + +**注意:** 以下命令均在Lite源码根目录下运行。在3个示例前,下面这段命令都先要执行用来准备环境: + +```bash +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl +adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer +adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image + +# 将OpenCL的kernels文件推送到/data/local/tmp/opencl目录下 +adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/ +adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/ +adb push lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/ +``` + +### 运行示例1: 编译产物demo示例 + +```bash +###################################################################### +# 编译mobile_full的demo # +###################################################################### +# 步骤: # +# 0.确保编译Paddle-Lite时编译了OpenCL; # +# 1.编辑`mobilenetv1_full_api.cc`代码, 开启`DEMO_USE_OPENCL`的宏; # +# 2.在产物目录`demo/cxx/mobile_full`下编译`mobile_full`的demo; # +# 3.上传demo, 模型, opencl kernel文件到手机; # +# 4.运行demo得到预期结果. # +###################################################################### +adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 +chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api +adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/ +adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 + +# use mobile_full run mobilenet_v1 +# `GLOG_v` is log level +adb shell "export GLOG_v=0; \ + /data/local/tmp/opencl/mobilenetv1_full_api \ + --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ + --optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model" + + + +###################################################################### +# 编译mobile_light的demo # +###################################################################### +# 步骤: # +# 0.确保编译Paddle-Lite时编译了OpenCL; # +# 1.编译model_optimize_tool并对模型优化, `targets`参数为`opencl`; # +# 2.在产物目录`demo/cxx/mobile_light`下编译`mobile_light`的demo; # +# 3.上传demo, 模型, opencl kernel文件到手机; # +# 4.运行demo得到预期结果. 
# +###################################################################### + +# use model_optimize_tool to optimize model +./build.model_optimize_tool/lite/api/model_optimize_tool \ + --model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ + --optimize_out_type=naive_buffer \ + --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ + --valid_targets=opencl + +adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 +chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api +adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/ +adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 + +# use mobile_light run mobilenet_v1 +adb shell "export GLOG_v=5; \ + /data/local/tmp/opencl/mobilenetv1_light_api \ + --model_dir=/data/local/tmp/opencl/" +``` + +### 运行示例2: test_mobilenetv1单元测试 + +- **运行文件准备** + +```bash +# 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下 +adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1 +adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/ + +# 将OpenCL单元测试程序test_mobilenetv1,推送到/data/local/tmp/opencl目录下 +adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/local/tmp/opencl +``` + +- **执行OpenCL推理过程** + +使用如下命令运行OpenCL程序。其中: + +- `--cl_path`指定了OpenCL的kernels文件即cl\_kernel所在目录; +- `--modle_dir`指定了模型文件所在目录。 + +```bash +adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1 + +adb shell /data/local/tmp/opencl/test_mobilenetv1 \ + --cl_path=/data/local/tmp/opencl \ + --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ + --warmup=1 \ + --repeats=1 +``` + +**注意:** 因为权重参数均会在Op Kernel第一次运行时进行加载,所以第一次的执行时间会略长。一般将warmup的值设为1,repeats值设为多次。 + +### 运行示例3: test_layout_opencl单元测试 + +- **运行文件准备** + +```bash +# 将OpenCL单元测试程序test_layout_opencl,推送到/data/local/tmp/opencl目录下 +adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/ +``` + + +OpenCL推理过程** + +```bash +adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl +adb shell /data/local/tmp/opencl/test_layout_opencl +``` + + +# 如何在Code中使用 + +见运行示例1的demo代码: + +1. [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc); +2. [./lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc). 
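+
+在上述 demo 代码之外,这里再补充一个使用 full api(`CxxConfig`)时声明 OpenCL valid places 的最小示意。其中 Place 组合参考了本仓库 opt 工具与单测中当前使用的写法(OpenCL 的 Image2D 布局优先,无对应 OpenCL kernel 的 op 回退到 ARM CPU),具体请以本地 demo 源码为准:
+
+```cpp
+#include <string>
+#include <vector>
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+std::shared_ptr<PaddlePredictor> BuildOpenCLPredictor(
+    const std::string& model_dir) {
+  CxxConfig config;
+  config.set_model_dir(model_dir);
+  // OpenCL kernel 优先选择 Image2D 布局,最后的 kARM 用于在无 OpenCL kernel 时回退到 CPU
+  std::vector<Place> valid_places({
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+      Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+      Place{TARGET(kARM)},  // enable kARM CPU kernel when no opencl kernel
+  });
+  config.set_valid_places(valid_places);
+  // 后续 GetInput / Run / GetOutput 的用法与 ARM CPU 完全一致
+  return CreatePaddlePredictor<CxxConfig>(config);
+}
+```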
+ +注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 + +**NOTE:** 对OpenCL的支持还在持续开发中。 diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 9c0e8e1c343b8eb1705e871aa652e3254474391d..f6f7ec75e65ff54e3f3642822e51057d3522ae3a 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -43,6 +43,7 @@ void Predictor::SaveModel(const std::string &dir, LOG(FATAL) << "Unknown model type"; } if (record_info) { + MkDirRecur(dir); SaveOpKernelInfo(dir); } } diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 29d8f4f29ab822f8c9601bbd63a3626abbbf1818..d517383d2773a02f9edba46c6df0df131c746876 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -25,6 +25,8 @@ void LightPredictor::Build(const std::string& lite_model_file, } else { LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + + DequantizeWeight(); BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 79f9bea762e099b249f597dddb7df790361edc2a..bcc9644f81542ab6fb8a0badf8ecaea89fc8dedb 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -23,6 +23,10 @@ #include "lite/core/op_registry.h" DEFINE_string(optimized_model, "", "optimized_model"); +DEFINE_int32(N, 1, "input_batch"); +DEFINE_int32(C, 3, "input_channel"); +DEFINE_int32(H, 224, "input_height"); +DEFINE_int32(W, 224, "input_width"); namespace paddle { namespace lite { @@ -37,7 +41,8 @@ void TestModel(const std::vector& valid_places, predictor.Build(model_dir, "", "", valid_places); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); for (int i = 0; i < item_size; i++) { @@ -58,6 +63,8 @@ void TestModel(const std::vector& valid_places, predictor.SaveModel(FLAGS_optimized_model); } + LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " " + << FLAGS_H << " " << FLAGS_W; LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats @@ -123,10 +130,10 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 84bd27e352f549d619cfa51f9127f973023e6d45..012d6d48d9e6d3747f83a7f1089944bbaf359f71 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -23,6 +23,10 @@ #include "lite/core/op_registry.h" DEFINE_string(optimized_model, "", "optimized_model"); +DEFINE_int32(N, 1, "input_batch"); +DEFINE_int32(C, 3, "input_channel"); +DEFINE_int32(H, 224, "input_height"); +DEFINE_int32(W, 224, "input_width"); namespace paddle { namespace lite { @@ 
-38,7 +42,8 @@ void TestModel(const std::vector& valid_places, predictor.Build(model_dir, "", "", valid_places); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); for (int i = 0; i < item_size; i++) { @@ -59,6 +64,8 @@ void TestModel(const std::vector& valid_places, predictor.SaveModel(FLAGS_optimized_model); } + LOG(INFO) << "input shape(NCHW):" << FLAGS_N << " " << FLAGS_C << " " + << FLAGS_H << " " << FLAGS_W; LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats @@ -123,8 +130,11 @@ TEST(MobileNetV2, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV2, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/opt.cc b/lite/api/opt.cc index c172169e59ec074b81a07e4fc96cd0363c50a10a..92f83371e30affa017a3796cd92cdce7fecc0753 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -26,9 +26,11 @@ #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" +#include "lite/core/version.h" #include "lite/model_parser/compatible_pb.h" #include "lite/model_parser/pb/program_desc.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/io.h" #include "lite/utils/string.h" #include "supported_kernel_op_info.h" // NOLINT @@ -89,13 +91,13 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(TARGET(kARM)); } else if (target_repr == "opencl") { valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); - valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); valid_places.emplace_back( Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}); valid_places.emplace_back( TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { @@ -239,6 +241,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { /// Print help information void PrintHelpInfo() { // at least one argument should be inputed + const std::string opt_version = lite::version(); const char help_info[] = "At least one argument should be inputed. 
Valid arguments are listed " "below:\n" @@ -260,7 +263,8 @@ void PrintHelpInfo() { " `--print_model_ops=true --model_dir= " "--valid_targets=(arm|opencl|x86|npu|xpu)`" " Display operators in the input model\n"; - std::cout << help_info << std::endl; + std::cout << "opt version:" << opt_version << std::endl + << help_info << std::endl; exit(1); } @@ -397,6 +401,7 @@ void Main() { return; } + lite::MkDirRecur(FLAGS_optimize_out); auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); if (model_dirs.size() == 0) { LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; @@ -451,7 +456,9 @@ int main(int argc, char** argv) { } google::ParseCommandLineFlags(&argc, &argv, false); paddle::lite_api::ParseInputCommand(); - paddle::lite_api::CheckIfModelSupported(); + if (FLAGS_model_set_dir == "") { + paddle::lite_api::CheckIfModelSupported(); + } paddle::lite_api::Main(); return 0; } diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 98404fe60fdb1384d390458e10dac8c967fd2b21..d17ce0dea4640899482ba9dd87d0646ca2de705d 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -922,7 +922,7 @@ void sgemv_trans(const int M, /* end */ \ "4: \n" /* end */ \ "fmov s1, %w[alpha] \n" /* mov alpha to s1 */ \ - "fcmp s8, #0 \n" /* cmp with zero*/ \ + "fcmp s8, #0.0 \n" /* cmp with zero*/ \ "bge 5f \n" /* if ge zero */ \ "fmul s8, s8, s1 \n" /* out * alpha */ \ "5: \n" /* leakey relu label */ \ diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 1e3586b7fde8d79fe49327185c623ac613cd080d..14086dcd16bd1a8770f444bdcd0b6bea78e23b7e 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -142,7 +142,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, #endif #ifdef RELU - output = activation(output); + output = activation_type4(output); #endif @@ -309,8 +309,8 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); #endif WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 8b713e537e74ca2d2a2e79dad7c325cda9c0e7a4..6bc791e64575b8f481f91ea3c28ea4896fe1860d 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -142,14 +142,13 @@ void StrideScal(const T* a, const T* x, T* y, int n, int stride); // remain is the product of dimension shapes after the axis dimension template void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { - std::vector entities(bs); for (int i = 0; i < bs; ++i) { - entities[i] = x[i * n]; + T entity = x[i * n]; for (int c = 1; c < n; ++c) { - entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i]; + entity = x[i * n + c] > entity ? 
x[i * n + c] : entity; } for (int c = 0; c < n; ++c) { - y[i * n + c] = x[i * n + c] - entities[i]; + y[i * n + c] = x[i * n + c] - entity; } } VExp(y, y, n * bs); diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index f242e14ad1119e9de78df4841d47ea40d8c751af..a17807e8a997f0ecf908313a4cb205676e4fa4b8 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -110,11 +110,7 @@ void set_constant(const lite::Context& context, lite::Tensor* tensor, float value) { TensorSetConstantWithTarget func(context, tensor, value); - // #ifdef PADDLE_WITH_CUDA - // tensor->target().apply_visitor(func); - // #else func(); - // #endif } template @@ -123,7 +119,7 @@ struct RowwiseAdd { const lite::Tensor& input, const lite::Tensor& vector, lite::Tensor* output) { - auto in_dims = input.dims(); + const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(output->dims(), in_dims); diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 2c761c6c2a08d24a52db41478456f8db332ef2d2..0327a50af3387588cf067c637762b625cadfc58f 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -103,12 +103,6 @@ void DeleteDynamicQuantOpFuser::InsertNewNode(SSAGraph* graph, // obtain values, save values and relink node int bit_length = quant_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_node->stmt()->op()->scope(); - auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) - ->GetMutable(); - float scale_value = scale_tensor->data()[0] / range; - auto outlinks = output_act_node->outlinks; for (auto* quantized_node : outlinks) { auto* op_desc = quantized_node->stmt()->mutable_op_info(); diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h index 5fab5000862378976c16448f5a82f052ffbc20a5..35cd2e6ef7e4a82335765e89bb6f80df07e4f903 100644 --- a/lite/core/mir/kernel_place_correct_pass.h +++ b/lite/core/mir/kernel_place_correct_pass.h @@ -86,7 +86,6 @@ class KernelPlaceCorrectPass : public DebugPass { << node_name; VLOG(4) << "-- input arg_name:" << arg_name << " " << "-- node name:" << node_name; - auto type = inst.picked_kernel().GetInputDeclType(arg_name); if (!x_in->AsArg().type) { need_correct_place &= false; } else { @@ -108,7 +107,6 @@ class KernelPlaceCorrectPass : public DebugPass { << node_name << " in Inst " << inst.op_type(); VLOG(4) << "-- output arg_name " << arg_name; - auto type = inst.picked_kernel().GetOutputDeclType(arg_name); if (!x_out->AsArg().type) { need_correct_place &= false; } else { diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 252517939990d8ce48083badb342c22fae1459c6..247795a86ce2cbe962b161311f7845622ee3983e 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -203,7 +203,7 @@ TEST(Subgraph, generate_model_and_check_precision) { valid_places, input_tensor_shape, input_tensor_type, - FLAGS_optimized_model_dir + "/ref_opt_model"); + FLAGS_optimized_model_dir + "_ref_opt_model"); // Generate and run optimized model on NPU/XPU as the target predictor #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); @@ -217,7 +217,7 @@ TEST(Subgraph, 
generate_model_and_check_precision) { valid_places, input_tensor_shape, input_tensor_type, - FLAGS_optimized_model_dir + "/tar_opt_model"); + FLAGS_optimized_model_dir + "_tar_opt_model"); // Check the difference of the output tensors between reference predictor and // target predictor CheckOutputTensors(tar_predictor, ref_predictor, output_tensor_type); diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index a0162ddfdc83a8245e8d0d8d8862f0413cac5d8e..6fb0a11c2e623f295a2c9b31ff7c3146f9fc5b98 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -25,7 +25,7 @@ cd Paddle-Lite --android_stl=c++_static \ --build_extra=ON \ --shutdown_log=OFF \ - tiny_publish + full_publish ``` 进入编译目录,下载模型和图片的压缩包,编译可执行文件。 @@ -40,10 +40,11 @@ make ``` # 下载paddlehub以后,通过python执行以下代码 import paddlehub as hub -pyramidbox_lite_mask = hub.Module(name="pyramidbox_lite_mask") +pyramidbox_lite_mobile_mask = hub.Module(name="pyramidbox_lite_mobile_mask") # 将模型保存在test_program文件夹之中 -pyramidbox_lite_mask.processor.save_inference_model(dirname="test_program") -通过以上命令,可以获得人脸检测和口罩佩戴判断模型,分别存储在pyramidbox_lite和mask_detector之中。文件夹中的__model__是模型结构文件,__param__文件是权重文件。 +pyramidbox_lite_mobile_mask.processor.save_inference_model(dirname="test_program") +# 通过以上命令,可以获得人脸检测和口罩佩戴判断模型,分别存储在pyramidbox_lite和mask_detector之中。文件夹中的__model__是模型结构文件,__param__文件是权重文件。 +# 从PaddleHub下载的是预测模型,需要使用PaddleLite提供的model_optimize_tools对预测模型进行转换,请参考[模型转换文档](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/model_optimize_tool/)。 ``` 电脑连接安卓手机,将可执行文件、测试图片、模型文件、预测库push到安卓手机上。 diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index c40e3d5e9aa1dfc88ca0fae8d14c11b2a6dcbe1d..1f7c4522f159dd080b5965fb383ab6624df3db4e 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -27,7 +27,10 @@ int64_t ShapeProduction(const shape_t& shape) { void RunModel(std::string model_dir) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_dir); + // NOTE: To load model transformed by model_optimize_tool before + // release/v2.3.0, plese use `set_model_dir` API as listed below. + // config.set_model_dir(model_dir); // 2. 
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h old mode 100755 new mode 100644 index dc2d56422cd710778a36c5e85f42e701fbfcbf0f..e01ffc25e29ca94166e8fe12b0643ae9e914001d --- a/lite/gen_code/paddle_infer.h +++ b/lite/gen_code/paddle_infer.h @@ -46,6 +46,7 @@ class Tensor { */ class PaddlePredictor { public: + void Init(); std::unique_ptr GetTensor(const std::string &id) const; std::unique_ptr GetMutableTensor(const std::string &id); diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc index 5af21af78fbbbe0425cad63e3047c330b79129b5..adbb50bf0c429a487a1993d5aedf06de56c237e6 100644 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -67,21 +67,21 @@ void LookupTableCompute::Run() { REGISTER_LITE_KERNEL(lookup_table, kARM, - kFloat, + kAny, kNCHW, paddle::lite::kernels::arm::LookupTableCompute, def) .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); REGISTER_LITE_KERNEL(lookup_table_v2, kARM, - kFloat, + kAny, kNCHW, paddle::lite::kernels::arm::LookupTableCompute, def) .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/lookup_table_compute.h b/lite/kernels/arm/lookup_table_compute.h index 2b66835e71618e299a2f34513dfcb32988848d74..f0c50f55b6d358d8855c78c27f62d53904c3c29d 100644 --- a/lite/kernels/arm/lookup_table_compute.h +++ b/lite/kernels/arm/lookup_table_compute.h @@ -21,7 +21,7 @@ namespace lite { namespace kernels { namespace arm { -class LookupTableCompute : public KernelLite { +class LookupTableCompute : public KernelLite { public: using param_t = operators::LookupTableParam; diff --git a/lite/kernels/arm/lookup_table_compute_test.cc b/lite/kernels/arm/lookup_table_compute_test.cc index 78748edf39c43c5451f8fa3c4d63bde7405c7078..ef6b04862df666cb5c8f3695ca1823cf90c1b313 100644 --- a/lite/kernels/arm/lookup_table_compute_test.cc +++ b/lite/kernels/arm/lookup_table_compute_test.cc @@ -53,7 +53,7 @@ void lookup_table_compute_ref(const operators::LookupTableParam ¶m) { TEST(lookup_table_arm, retrieve_op) { auto lookup_table = - KernelRegistry::Global().Create( + KernelRegistry::Global().Create( "lookup_table"); ASSERT_FALSE(lookup_table.empty()); ASSERT_TRUE(lookup_table.front()); @@ -61,7 +61,7 @@ TEST(lookup_table_arm, retrieve_op) { TEST(lookup_table_arm, init) { LookupTableCompute lookup_table; - ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat)); + ASSERT_EQ(lookup_table.precision(), PRECISION(kAny)); ASSERT_EQ(lookup_table.target(), TARGET(kARM)); } @@ -112,4 +112,4 @@ TEST(lookup_table_arm, compute) { } // namespace lite } // namespace paddle -USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(lookup_table, kARM, kAny, kNCHW, def); diff --git a/lite/kernels/arm/write_to_array_compute.cc b/lite/kernels/arm/write_to_array_compute.cc index a394c28a698c278dea7ded51ae016b777d2a0971..1580156e7987071439cdcbe832a07a58eb025b0d 100644 --- a/lite/kernels/arm/write_to_array_compute.cc +++ 
b/lite/kernels/arm/write_to_array_compute.cc @@ -65,6 +65,6 @@ REGISTER_LITE_KERNEL(write_to_array, paddle::lite::kernels::arm::WriteToArrayCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) - .BindInput("I", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("I", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorListTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index 50c7f9d65a94658f8670ac63e658656b982f4649..00aa4b3497dd0f9bebbfa31b0256250b30b40a30 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -73,8 +73,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto shape = std::vector(actual_shape_data, actual_shape_data + actual_shape_dims.production()); - auto out_dims = lite::operators::ValidateShape(shape, x_dims); - auto out_shape = out_dims.Vectorize(); + auto out_shape = lite::operators::ValidateShape(shape, x_dims); if (out_shape.size() > 4) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but Shape has " @@ -88,8 +87,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { reshape_op->set_input_w(*actual_shape_node->data()); } else { auto shape = op_info->GetAttr>("shape"); - auto out_dims = lite::operators::ValidateShape(shape, x_dims); - auto out_shape = out_dims.Vectorize(); + auto out_shape = lite::operators::ValidateShape(shape, x_dims); if (out_shape.size() > 4) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but shape has " diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index e81fdf307e94fbb6593962052b911c34a944777a..f87b37fc62343b00aedd92fc7c30de3ea42c3c9d 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -16,7 +16,6 @@ add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${te add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(sigmoid_opencl OPENCL basic SRCS sigmoid_compute.cc DEPS ${cl_kernel_deps}) add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) -#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) @@ -62,14 +61,10 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc DEPS depthwise_conv2d_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_depthwise_conv2d_basic_opencl SRCS depthwise_conv2d_basic_compute_test.cc - DEPS depthwise_conv2d_opencl op_registry program context +lite_cc_test(test_depthwise_conv2d_image2d_opencl SRCS depthwise_conv2d_image2d_compute_test.cc + DEPS conv_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc -# DEPS conv2d_1x1_opencl op_registry program context -# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) - lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc DEPS reshape_opencl op_registry program context ARGS 
--cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) diff --git a/lite/kernels/opencl/concat_compute.cc b/lite/kernels/opencl/concat_compute.cc index 0f25439ed00a9ff579bbd59a543dba3c8c3b090b..c57602e39aea27250eabfcf7a0570d80d7ff3dc4 100644 --- a/lite/kernels/opencl/concat_compute.cc +++ b/lite/kernels/opencl/concat_compute.cc @@ -356,17 +356,17 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kImageDefault))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kNCHW))}) - .BindInput("AxisTensor", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kInt32), - DATALAYOUT(kNCHW))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kNCHW))}) - .Finalize(); +// REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def) +// .BindInput("X", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kNCHW))}) +// .BindInput("AxisTensor", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kInt32), +// DATALAYOUT(kNCHW))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kOpenCL), +// PRECISION(kFloat), +// DATALAYOUT(kNCHW))}) +// .Finalize(); diff --git a/lite/kernels/opencl/concat_compute_test.cc b/lite/kernels/opencl/concat_compute_test.cc index 37e7b6658be2eaa60285474b3766ce462ea3779b..9af0666cc9bdef184654a026bbfb6004c2ccdd18 100644 --- a/lite/kernels/opencl/concat_compute_test.cc +++ b/lite/kernels/opencl/concat_compute_test.cc @@ -73,7 +73,7 @@ void concat_mul_compute_ref(std::vector ins_data, } } } -#if 1 // concat_buffer +#if 0 // concat_buffer TEST(opencl_concat_buffer, compute) { // prepare data const DDim x0_dim = DDim(std::vector{1, 2, 3, 4}); @@ -382,7 +382,7 @@ TEST(concat_image2d_fp32, compute) { } // namespace paddle // concat buffer -USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def); +// USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def); // concat image2d fp32 USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); diff --git a/lite/kernels/opencl/conv2d_1x1_compute.cc b/lite/kernels/opencl/conv2d_1x1_compute.cc deleted file mode 100644 index 975105fd41cd0b0224b760222a3e08a5ea4601aa..0000000000000000000000000000000000000000 --- a/lite/kernels/opencl/conv2d_1x1_compute.cc +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "lite/backends/opencl/cl_include.h" -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/kernels/opencl/image_helper.h" -#include "lite/operators/op_params.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace opencl { - -#define USE_BUFFER_FOR_CONV1x1_BIAS -class Conv2d1x1Image2DCompute : public KernelLite { - public: - using param_t = operators::ConvParam; - - void PrepareForRun() override { - const auto& param = *param_.get_mutable(); - if (param.fuse_relu) { - build_options_ += " -DRELU"; - } - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (has_bias) { - build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH"; - } - auto& context = ctx_->As(); - if (param.x->dims()[1] % 4 == 0) { - context.cl_context()->AddKernel(kernel_func_name_simple_, - "image/conv2d_1x1_kernel.cl", - build_options_); - } else { - context.cl_context()->AddKernel( - kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_); - } - } - - void Run() override { - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = param.filter->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - VLOG(4) << "============ conv2d_1x1 params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " 
<< w << ", " << nh << "" - << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { -#ifndef USE_BUFFER_FOR_CONV1x1_BIAS - is_element_wise_bias - ? (bias_image = param.bias->data()) - : (bias_buf = param.bias->data()); -#else - bias_image = param.bias->data(); -#endif - } - - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - if (input_dims[1] % 4 == 0) { - kernel_key << kernel_func_name_simple_ << build_options_; - } else { - kernel_key << kernel_func_name_ << build_options_; - } - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - int maped_w = maptofactor(w, 4); - - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... " << kernel_key.str(); - VLOG(4) << "maped_w: " << maped_w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, maped_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifndef USE_BUFFER_FOR_CONV1x1_BIAS - if (is_element_wise_bias != 0) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - } else { - VLOG(4) << "set bias_buf: "; - status = kernel.setArg(++arg_idx, *bias_buf); - } -#else - status = kernel.setArg(++arg_idx, *bias_image); -#endif - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); - CL_CHECK_FATAL(status); - - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(maped_w), - static_cast(default_work_size.data()[2])}; - - VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - event_.get()); - CL_CHECK_FATAL(status); - context.cl_wait_list()->emplace(out_image, event_); - } - - private: - std::string kernel_func_name_{"conv2d_1x1"}; - std::string kernel_func_name_simple_{"conv2d_1x1_simple"}; - std::string build_options_{"-DCL_DTYPE_float"}; - std::shared_ptr event_{new cl::Event}; -}; - -} // namespace 
opencl -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(conv2d_1x1, - kOpenCL, - kFloat, - kImageDefault, - paddle::lite::kernels::opencl::Conv2d1x1Image2DCompute, - image2d) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindInput("Bias", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindInput("Filter", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageNW))}) - .BindOutput("Output", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .Finalize(); diff --git a/lite/kernels/opencl/conv2d_1x1_compute_test.cc b/lite/kernels/opencl/conv2d_1x1_compute_test.cc deleted file mode 100644 index 6879d2ba38cdb98cd1dd4df8fe2f3b3c90cc22f2..0000000000000000000000000000000000000000 --- a/lite/kernels/opencl/conv2d_1x1_compute_test.cc +++ /dev/null @@ -1,413 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include - -#include "lite/backends/opencl/cl_image_converter.h" -#include "lite/backends/opencl/target_wrapper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/utils/logging.h" - -namespace paddle { -namespace lite { - -template -static void conv_basic(const Dtype1* din, - Dtype2* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const Dtype1* weights, - const Dtype2* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu) { - Dtype2 beta = 0; - auto src_data = din; - auto dst_data_ref = dout; - auto weights_data = weights; - auto with_bias = flag_bias; - auto bias_data = bias; - - int in_num = num; - int out_channels = chout; - int out_h = hout; - int out_w = wout; - - int in_channel = chin; - int in_h = hin; - int in_w = win; - int out_c_group = out_channels / group; - int in_c_group = in_channel / group; - - for (int n = 0; n < in_num; ++n) { - for (int g = 0; g < group; ++g) { - for (int oc = 0; oc < out_c_group; ++oc) { - for (int oh = 0; oh < out_h; ++oh) { - for (int ow = 0; ow < out_w; ++ow) { - int out_idx = n * group * out_c_group * out_h * out_w + - g * out_c_group * out_h * out_w + oc * out_h * out_w + - oh * out_w + ow; - Dtype2 bias_d = - with_bias ? 
(bias_data[g * out_c_group + oc]) : (Dtype2)0; - dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta; - for (int ic = 0; ic < in_c_group; ++ic) { - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - int iw = ow * stride_w - pad_w + kw * (dila_w); - int ih = oh * stride_h - pad_h + kh * (dila_h); - if (iw < 0 || iw >= in_w) continue; - if (ih < 0 || ih >= in_h) continue; - - int iidx = n * in_channel * in_h * in_w + - g * in_c_group * in_h * in_w + ic * in_h * in_w + - ih * in_w + iw; - int widx = - g * out_c_group * in_c_group * kernel_h * kernel_w + - oc * in_c_group * kernel_h * kernel_w + - ic * kernel_h * kernel_w + kh * kernel_w + kw; - - dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; - } - } - } - if (flag_relu) { - dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 - ? dst_data_ref[out_idx] - : (Dtype2)0; - } - } - } - } - } - } -} -TEST(conv2d_1x1, compute) { - // conv infos - const int ksize = 1; - const int stride = 1; - const int pad = 0; - const int group = 1; - const int dilation = 0; -// int loop_cnt = 0; - -#ifdef LOOP_TEST - for (int batch_size = 1; batch_size < 4; ++batch_size) { - for (int oc = 4; oc < 10; oc += 1) { // oc - for (int ih = 4; ih < 9; ih += 1) { // ih - /*int iw = ih;*/ for (int iw = 4; iw < 10; iw += 1) { // iw - for (int ic = 4; ic < 10; ic += 1) { // ic - for (bool bias_flag : {true, false}) { - for (bool relu_flag : {true, false}) { -#else - const int batch_size = 1; - const int oc = 4; - const int ih = 8; - const int iw = 8; - const int ic = 4; - const bool bias_flag = false; - const bool relu_flag = false; -#endif - const int oh = ih; - const int ow = iw; - - VLOG(4) << "to get kernel ..."; - auto kernels = - KernelRegistry::Global().Create("conv2d_1x1", - TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault)); - ASSERT_FALSE(kernels.empty()); - - auto kernel = std::move(kernels.front()); - VLOG(4) << "created conv2d_1x1 kernel"; - - VLOG(4) << "prepare kernel ------"; - - lite::Tensor input, filter, bias, output; - operators::ConvParam param; - param.x = &input; - param.filter = &filter; - param.output = &output; - if (bias_flag) { - param.bias = &bias; - } - param.fuse_relu = relu_flag; - - std::vector paddings = {pad, pad, pad, pad}; - std::vector dilations = {dilation, dilation}; - - param.paddings = std::make_shared>(paddings); - param.dilations = std::make_shared>(dilations); - param.strides = std::vector{stride, stride}; - - std::unique_ptr context(new KernelContext); - context->As().InitOnce(); - - std::unique_ptr conv_1x1_context( - new KernelContext); - context->As().CopySharedTo( - &(conv_1x1_context->As())); - kernel->SetContext(std::move(conv_1x1_context)); - - const DDim& input_dim = - lite::DDim{std::vector({batch_size, ic, ih, iw})}; - - const DDim& filter_dim = - lite::DDim{std::vector({oc, ic, ksize, ksize})}; - const DDim& out_dim = - lite::DDim{std::vector({batch_size, oc, ih, iw})}; - // element wise bias - const DDim& bias_dim = lite::DDim{std::vector({oc})}; - - param.x->Resize(input_dim); - param.filter->Resize(filter_dim); - param.output->Resize(out_dim); - if (bias_flag) { - param.bias->Resize(bias_dim); - } - - kernel->SetParam(param); - - size_t input_image_width = iw * ((ic + 3) / 4); - size_t input_image_height = ih * batch_size; - - size_t out_image_width = ow * ((oc + 3) / 4); - size_t out_image_height = oh * batch_size; - - size_t bias_image_width = ow * ((oc + 3) / 4); - size_t bias_image_height = oh * batch_size; - - size_t 
filter_image_width = ksize * ((oc + 3) / 4); - size_t filter_image_height = ic * ksize; - - const size_t cl_image2d_row_pitch{0}; - const size_t cl_image2d_slice_pitch{0}; - - std::default_random_engine engine; - std::uniform_real_distribution gen(-5, 5); - - std::vector input_v(batch_size * ic * ih * iw); - std::vector filter_v(oc * ic * ksize * ksize); - std::vector output_v(batch_size * oc * ih * iw); - std::vector bias_v(oc); - - VLOG(4) << "gen input and filter ..."; - - for (auto& i : input_v) { - i = gen(engine); - } - for (auto& f : filter_v) { - f = gen(engine); - } - - VLOG(4) << "after gen input and filter ..."; - VLOG(4) << "input_v.size(): " << input_v.size(); - VLOG(4) << "filter_v.size(): " << filter_v.size(); - VLOG(4) << "output_v.size(): " << output_v.size(); - VLOG(4) << "bias_v.size(): " << bias_v.size(); - VLOG(4) << "input_dim.production(): " << input_dim.production(); - VLOG(4) << "filter_dim.production(): " - << filter_dim.production(); - VLOG(4) << "out_dim.production(): " << out_dim.production(); - VLOG(4) << "bias_dim.production(): " << bias_dim.production(); - VLOG(4) << "4 * input_image_height * input_image_width: " - << 4 * input_image_height * input_image_width; - VLOG(4) << "4 * filter_image_width * filter_image_height: " - << 4 * filter_image_width * filter_image_height; - - CHECK(input_dim.production() == input_v.size()); - CHECK_LE(input_dim.production(), - 4 * input_image_height * input_image_width); - CHECK(filter_dim.production() == filter_v.size()); - CHECK_LE(filter_dim.production(), - 4 * filter_image_width * filter_image_height); - - paddle::lite::CLImageConverterDefault default_convertor; - VLOG(4) << "set mapped input ..."; - std::vector x_image_v( - input_image_width * input_image_height * 4); // 4 : RGBA - std::vector filter_image_v( - filter_image_width * filter_image_height * 4); // 4 : RGBA - std::vector bias_image_v( - bias_image_width * bias_image_height * 4); // 4 : RGBA - std::vector out_image_v( - out_image_width * out_image_height * 4); // 4 : RGBA - - default_convertor.NCHWToImage( - input_v.data(), x_image_v.data(), input_dim); - - /* for (int j = 0; j < input_v.size(); j += 1) { - // VLOG(4) << "input_v - input[" << j << "]: - // " << input_v.data()[j]; - std::cout << j << " " << input_v.data()[j] << - std::endl; - } - std::cout << std::endl; - - for (int j = 0; j < x_image_v.size(); j += 1) { - // VLOG(4) << "x_image_v - input[" << j << - // "]: " << - x_image_v.data()[j]; - std::cout << j << " " << x_image_v.data()[j] - << std::endl; - }*/ - - VLOG(4) << "set mapped filter ..."; - paddle::lite::CLImageConverterNWBlock nw_convertor; - nw_convertor.NCHWToImage( - filter_v.data(), filter_image_v.data(), filter_dim); - - auto* input_image2d = input.mutable_data( - input_image_width, input_image_height, x_image_v.data()); - auto* filter_image2d = filter.mutable_data( - filter_image_width, - filter_image_height, - filter_image_v.data()); - - if (bias_flag) { - nw_convertor.NCHWToImage( - filter_v.data(), filter_image_v.data(), filter_dim); - - for (int i = 0; i < bias_dim.production(); ++i) { - bias_v[i] = static_cast(gen(engine)); - } - CLImageConverterFolder folder_convertor; - folder_convertor.NCHWToImage( - bias_v.data(), bias_image_v.data(), bias_dim); - auto* bias_data = bias.mutable_data( - bias_image_width, bias_image_height, bias_image_v.data()); - } - - VLOG(4) << "resize output ..."; - output.Resize(out_dim); - - // cpu conv basic calc - lite::Tensor out_ref; - out_ref.Resize(out_dim); - - VLOG(4) << "prepare kernel ready"; - 
- VLOG(4) << "kernel launch ..."; - kernel->Launch(); - VLOG(4) << "mutable output ..."; - auto* output_image2d = output.mutable_data( - out_image_width, out_image_height); - - auto* wait_list = context->As().cl_wait_list(); - auto* out_ptr = param.output->data(); - auto it = wait_list->find(out_ptr); - - if (it != wait_list->end()) { - VLOG(4) << "--- Find the sync event for the target cl " - "tensor. ---"; - auto& event = *(it->second); - event.wait(); - } else { - LOG(FATAL) << "Could not find the sync event for the target " - "cl tensor."; - } - - TargetWrapperCL::ImgcpySync(out_image_v.data(), - output.data(), - out_image_width, - out_image_height, - cl_image2d_row_pitch, - cl_image2d_slice_pitch, - IoDirection::DtoH); - - DDim out_image_shape = - default_convertor.InitImageDimInfoWith(output.dims()); - - default_convertor.ImageToNCHW(out_image_v.data(), - output_v.data(), - out_image_shape, - output.dims()); - VLOG(4) << "mutable_data out_ref_data: "; - - // run cpu ref - auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); - - VLOG(4) << " conv_basic beigin ..... "; - - conv_basic(input_v.data(), - out_ref_data, - batch_size, - oc, - oh, - ow, - ic, - ih, - iw, - filter_v.data(), - bias_v.data(), // mapped_bias, - group, - ksize, - ksize, - stride, - stride, - dilation, - dilation, - pad, - pad, - bias_flag, - relu_flag); - VLOG(4) << " conv_basic end ..... "; - - VLOG(4) << " out_dim: " << out_dim; - const DDim& out_image_dims = lite::DDim{std::vector( - {static_cast(out_image_width), - static_cast(out_image_height)})}; - - for (int i = 0; i < out_dim.production(); i++) { - EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); - if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { - LOG(FATAL) << "error idx:" << i; - } - } - -#ifdef LOOP_TEST - } - } - } - } - } - } - } -#else -// nothing to do. 
-#endif -} - -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kImageDefault, image2d); diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc index c3d3e2a6c27f794268ef42ac97ab492ddd4e9de1..d00101552d4376bc4ac2a176016c1a9a449c35a7 100644 --- a/lite/kernels/opencl/conv_compute.cc +++ b/lite/kernels/opencl/conv_compute.cc @@ -362,6 +362,44 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d1x1; +#if 1 // TODO(ysh329): enable general dwconv + } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]) { +#else // TODO(ysh329): remove dwconv3x3s1 and dwconv3x3 temporarily, need fix + } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && + kernel_h == 3 && kernel_w == 3 && groups > 1) { + // depth_conv2d_3x3s1, depth_conv2d_3x3 + if (stride_h == 1 && dilations[0] == 1) { + kernel_func_names_.push_back("depth_conv2d_3x3s1"); + impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; + } else { + kernel_func_names_.push_back("depth_conv2d_3x3"); + impl_ = &ConvImageCompute::DepthwiseConv2d3x3; + } + kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); + + CLImageConverterNWBlock converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && + kernel_h != 3) { +#endif + // depth_conv2d + kernel_func_names_.push_back("depth_conv2d"); + kernel_func_paths_.push_back("image/depthwise_conv2d_basic_kernel.cl"); + + CLImageConverterNWBlock converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::DepthwiseConv2d; } else if (kernel_h == 3 && kernel_h == 3) { // conv2d_3x3 kernel_func_names_.push_back("conv2d_3x3"); @@ -407,6 +445,8 @@ void ConvImageCompute::PrepareForRun() { } else { LOG(FATAL) << "conv image compute not support this condition yet! 
"; } + VLOG(1) << "kernel_func_names_[0]:" << kernel_func_names_[0] + << " kernel_func_paths_[0]:" << kernel_func_paths_[0]; std::string build_options_single(" -DCL_DTYPE_float"); // relu options @@ -1064,6 +1104,326 @@ void ConvImageCompute::Conv2d7x7() { context.cl_wait_list()->emplace(out_image, event_); } +void ConvImageCompute::DepthwiseConv2d3x3s1() { + const auto& param = *param_.get_mutable(); + auto x_dims = param.x->dims(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + auto* input_img = param.x->data(); + auto* filter_img = filter_gpu_image_.data(); + + const cl::Image2D* bias_img = nullptr; + if (param.bias) { + bias_img = bias_gpu_image_.data(); + } + + auto image_shape = InitImageDimInfoWith(output_dims); + + auto* output_img = param.output->mutable_data( + image_shape["width"], image_shape["height"]); + + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + int c_block = (output_dims[1] + 3) / 4; + int w = output_dims[3]; + int nh = output_dims[0] * output_dims[2]; + + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + auto global_work_size = cl::NDRange(c_block, w_blk, nh); + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, static_cast(c_block)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(w_blk)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(nh)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *output_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(strides[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(paddings[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(dilations[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); + CL_CHECK_FATAL(status); + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(output_img, event_); +} + +void ConvImageCompute::DepthwiseConv2d3x3() { + const auto& param = *param_.get_mutable(); + auto x_dims = param.x->dims(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + int offset = filter_dims[2] / 2 - paddings[0]; + int input_c_block = (x_dims[1] + 3) / 4; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + auto* input_img = param.x->data(); + auto* filter_img = filter_gpu_image_.data(); + + const cl::Image2D* bias_img = nullptr; + if 
(param.bias) { + bias_img = bias_gpu_image_.data(); + } + + auto image_shape = InitImageDimInfoWith(output_dims); + + auto* output_img = param.output->mutable_data( + image_shape["width"], image_shape["height"]); + + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + int c_block = (output_dims[1] + 3) / 4; + int w = output_dims[3]; + int nh = output_dims[0] * output_dims[2]; + auto global_work_size = cl::NDRange(c_block, w, nh); + + VLOG(4) << "setArg"; + VLOG(4) << "c_block = " << c_block; + VLOG(4) << "w = " << w; + VLOG(4) << "nh = " << nh; + + VLOG(4) << "strides = " << strides[0]; + VLOG(4) << "offset = " << offset; + VLOG(4) << "dilations = " << dilations[0]; + VLOG(4) << "input_c_block = " << input_c_block; + VLOG(4) << "x_dims[3] = " << x_dims[3]; + VLOG(4) << "x_dims[2] = " << x_dims[2]; + VLOG(4) << "output_dims[3] = " << output_dims[3]; + VLOG(4) << "output_dims[2] = " << output_dims[2]; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, static_cast(c_block)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(w)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(nh)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *output_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(strides[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(offset)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(dilations[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(input_c_block)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); + CL_CHECK_FATAL(status); + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(output_img, event_); +} + +void ConvImageCompute::DepthwiseConv2d() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int filter_width = filter_dims[3]; + int filter_height = filter_dims[2]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + int offset = static_cast(param.filter->dims()[2]) / 2 - + static_cast(paddings[0]); + + // calc input_c_block + auto input_image_shape = 
InitImageDimInfoWith(input_dims); + int input_c_block = input_image_shape["width"] / input_dims[3]; + int input_c = input_dims[1]; + auto dilations = *param.dilations; + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + VLOG(4) << "============ depthwise conv2d params ============"; + VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + << input_image_shape["height"]; + VLOG(4) << "input_c_block: " << input_c_block; + VLOG(4) << "input_c: " << input_c; + VLOG(4) << "input_image: " << input_image; + VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "offset: " << offset; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; + + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + // handle bias use buffer for channel wise , use image for element wise + const cl::Buffer* bias_buf = nullptr; + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... 
" << kernel_key.str(); + VLOG(4) << "w: " << w; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, nh); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + VLOG(4) << "set bias_image: "; + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, offset); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_c_block); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, filter_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, filter_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + VLOG(4) << "out_image: " << out_image; + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} + void ConvImageCompute::Run() { (this->*impl_)(); } } // namespace opencl @@ -1071,19 +1431,37 @@ void ConvImageCompute::Run() { (this->*impl_)(); } } // namespace lite } // namespace paddle +// REGISTER_LITE_KERNEL(conv2d, +// kOpenCL, +// kFloat, +// kNCHW, +// paddle::lite::kernels::opencl::ConvCompute, +// def) +// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) +// .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) +// .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))}) +// .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))}) +// .Finalize(); + REGISTER_LITE_KERNEL(conv2d, kOpenCL, kFloat, - kNCHW, - paddle::lite::kernels::opencl::ConvCompute, - def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))}) + kImageDefault, + paddle::lite::kernels::opencl::ConvImageCompute, + image2d) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault))}) .Finalize(); -REGISTER_LITE_KERNEL(conv2d, +REGISTER_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, diff --git 
a/lite/kernels/opencl/conv_compute.h b/lite/kernels/opencl/conv_compute.h index d5dd65cdc855ebc25624e8316866a5944a2418b8..672ba9d223031edf1ebc3d955908c4ab8edc0834 100644 --- a/lite/kernels/opencl/conv_compute.h +++ b/lite/kernels/opencl/conv_compute.h @@ -74,6 +74,9 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; diff --git a/lite/kernels/opencl/conv_compute_test.cc b/lite/kernels/opencl/conv_compute_test.cc index 1c7cca63ae4d1c0a5183b512827f4b6943f994af..af59873336fb154b34d7ada398d7fe8e568e7655 100644 --- a/lite/kernels/opencl/conv_compute_test.cc +++ b/lite/kernels/opencl/conv_compute_test.cc @@ -166,6 +166,8 @@ void PrintData(std::string name, } } +// buffer +#if 0 // #define PRINT_RESULT #define LOOP_TEST TEST(conv2d, compute_conv2d_1x1) { @@ -623,8 +625,9 @@ TEST(conv2d, compute_conv2d_gemm) { } // batch_size #endif } +#endif } // namespace lite } // namespace paddle -USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); +// USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); diff --git a/lite/kernels/opencl/conv_image2d_compute_test.cc b/lite/kernels/opencl/conv_image2d_compute_test.cc index 3e698a4ae838a74882317014df42cee9d2c7961c..4c81978b405e3acb4bc0e3ecc44b1ec10ac903b7 100644 --- a/lite/kernels/opencl/conv_image2d_compute_test.cc +++ b/lite/kernels/opencl/conv_image2d_compute_test.cc @@ -559,9 +559,11 @@ TEST(conv2d, compute_image2d_3x3) { // element wise bias const DDim& bias_dim = lite::DDim{std::vector({oc})}; - LOG(INFO) << "input_dim:" << input_dim - << " filter_dim:" << filter_dim - << " out_dim:" << out_dim; + VLOG(2) << "input_dim:" << input_dim + << " filter_dim:" << filter_dim << " out_dim:" << out_dim + << " bias_flag:" << bias_flag << " bias_dim:" << bias_dim + << " group:" << group << " stride:" << stride + << " pad:" << pad << " dilation:" << dilation; param.x->Resize(input_dim); param.filter->Resize(filter_dim); @@ -902,6 +904,12 @@ TEST(conv2d, compute_image2d_5x5) { // element wise bias const DDim& bias_dim = lite::DDim{std::vector({oc})}; + VLOG(2) << "input_dim:" << input_dim + << " filter_dim:" << filter_dim << " out_dim:" << out_dim + << " bias_flag:" << bias_flag << " bias_dim:" << bias_dim + << " group:" << group << " stride:" << stride + << " pad:" << pad << " dilation:" << dilation; + param.x->Resize(input_dim); param.filter->Resize(filter_dim); param.output->Resize(out_dim); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute.cc b/lite/kernels/opencl/depthwise_conv2d_compute.cc index 554cc87c5f21e283316df402d195ec8bf8c4d738..0c88509926041411eddac66bea08b5d3a08d6a3c 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc @@ -123,420 +123,6 @@ class DepthwiseConv2dCompute std::shared_ptr event_{new cl::Event}; }; -class DepthwiseConv2dComputeFP16Image - : public KernelLite { - public: - using param_t = operators::ConvParam; - - std::string doc() const override { - return "DepthwiseConv2d using cl::Image2D/kImageDefault, kFP16"; - } - - void PrepareForRun() override { - const auto& param = *param_.get_mutable(); - if (param.fuse_relu) { - build_options_ += " -DRELU"; - } else if (param.activation_param.active_type == - lite_api::ActivationType::kRelu6) { - build_options_ += " -DRELU6"; - } - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_); - } - - void Run() override { - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = 
param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - int offset = filter_dims[2] / 2 - paddings[0]; - int input_c_block = (x_dims[1] + 3) / 4; - - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* input_img = param.x->data(); - auto* filter_img = param.filter->data(); - - auto* bias_img = param.bias == nullptr - ? static_cast(nullptr) - : param.bias->data(); - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - auto global_work_size = cl::NDRange(c_block, w, nh); - - VLOG(4) << "setArg"; - VLOG(4) << "c_block = " << c_block; - VLOG(4) << "w = " << w; - VLOG(4) << "nh = " << nh; - - VLOG(4) << "strides = " << strides[0]; - VLOG(4) << "offset = " << offset; - VLOG(4) << "dilations = " << dilations[0]; - VLOG(4) << "input_c_block = " << input_c_block; - VLOG(4) << "x_dims[3] = " << x_dims[3]; - VLOG(4) << "x_dims[2] = " << x_dims[2]; - VLOG(4) << "output_dims[3] = " << output_dims[3]; - VLOG(4) << "output_dims[2] = " << output_dims[2]; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, static_cast(c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(w)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(nh)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(offset)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - event_.get()); - CL_CHECK_FATAL(status); - context.cl_wait_list()->emplace(output_img, event_); - } - - private: - std::string kernel_func_name_{"depth_conv2d_3x3"}; - std::string build_options_{"-DCL_DTYPE_half"}; - std::shared_ptr event_{new cl::Event}; -}; - -class DepthwiseConv2d3x3s1ComputeFP16Image - : public KernelLite { - public: - using param_t = operators::ConvParam; - - std::string doc() const override { - return "DepthwiseConv2d3x3s1 using cl::Image2D/kImageDefault, kFP16"; - } - - void PrepareForRun() override { - const auto& param = *param_.get_mutable(); - if (param.fuse_relu) { - build_options_ += " -DRELU"; - } else if (param.activation_param.active_type == - lite_api::ActivationType::kRelu6) 
{ - build_options_ += " -DRELU6"; - } - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_); - } - - void Run() override { - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* input_img = param.x->data(); - auto* filter_img = param.filter->data(); - - auto* bias_img = param.bias == nullptr - ? static_cast(nullptr) - : param.bias->data(); - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - auto global_work_size = cl::NDRange(c_block, w_blk, nh); - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, static_cast(c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(w_blk)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(nh)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - event_.get()); - CL_CHECK_FATAL(status); - context.cl_wait_list()->emplace(output_img, event_); - } - - private: - std::string kernel_func_name_{"depth_conv2d_3x3s1"}; - std::string build_options_{"-DCL_DTYPE_half"}; - std::shared_ptr event_{new cl::Event}; -}; - -class DepthwiseConv2dBasicComputeFP32Image - : public KernelLite { - public: - using param_t = operators::ConvParam; - - std::string doc() const override { - return "DepthwiseConv2d basic using cl::Image2D/kImageDefault, kFloat32"; - } - - void PrepareForRun() override { - const auto& param = *param_.get_mutable(); - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (param.fuse_relu) { - build_options_ += " -DRELU"; - } else if (param.activation_param.active_type == - lite_api::ActivationType::kRelu6) { - build_options_ += " -DRELU6"; 
- } - if (has_bias) { - build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH"; - } - auto& context = ctx_->As(); - context.cl_context()->AddKernel(kernel_func_name_, - "image/depthwise_conv2d_basic_kernel.cl", - build_options_); - } - - void Run() override { - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = param.filter->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - - VLOG(4) << "============ depthwise conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - VLOG(4) << "default work size{c_block, w, nh}: " - << "{" << c_block << ", " << w << ", " << nh << "" - << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = param.bias->data(); - } - - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - VLOG(4) << "kernel_key: " << kernel_key.str(); - VLOG(4) << "kernel ready ... 
" << kernel_key.str(); - VLOG(4) << "w: " << w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - - auto global_work_size = - cl::NDRange{static_cast(default_work_size.data()[0]), - static_cast(default_work_size.data()[1]), - static_cast(default_work_size.data()[2])}; - - VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - event_.get()); - CL_CHECK_FATAL(status); - context.cl_wait_list()->emplace(out_image, event_); - } - - private: - std::string kernel_func_name_{"depth_conv2d"}; - std::string build_options_{"-DCL_DTYPE_float"}; - std::shared_ptr event_{new cl::Event}; -}; } // namespace opencl } // namespace kernels } // namespace lite @@ -553,52 +139,3 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); - -REGISTER_LITE_KERNEL( - depthwise_conv2d, - kOpenCL, - kFP16, - kImageDefault, - paddle::lite::kernels::opencl::DepthwiseConv2dComputeFP16Image, - image2d) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Bias", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .BindInput("Filter", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageNW))}) - .BindOutput("Output", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault))}) - .Finalize(); -REGISTER_LITE_KERNEL( - depthwise_conv2d_basic, - kOpenCL, - kFloat, - kImageDefault, - paddle::lite::kernels::opencl::DepthwiseConv2dBasicComputeFP32Image, - image2d) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindInput("Bias", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .BindInput("Filter", - {LiteType::GetTensorTy(TARGET(kOpenCL), - 
PRECISION(kFloat), - DATALAYOUT(kImageNW))}) - .BindOutput("Output", - {LiteType::GetTensorTy(TARGET(kOpenCL), - PRECISION(kFloat), - DATALAYOUT(kImageDefault))}) - .Finalize(); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc index c52aa87a73c8f9cbd91851c96162cde817f299b4..40cfdfffab452a004d45d804f62309dc71e0b0d9 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc @@ -177,135 +177,7 @@ TEST(depthwise_conv2d_buffer_fp32, compute) { TargetWrapperCL::Unmap(input_data, mapped_input); } -TEST(depthwise_conv2d_image2d_fp16, compute) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", - TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault)); - ASSERT_FALSE(kernels.empty()); - - auto kernel = std::move(kernels.front()); - - LOG(INFO) << "get kernel"; - lite::Tensor input, filter, output; - operators::ConvParam param; - param.x = &input; - param.filter = &filter; - param.output = &output; - std::vector paddings = {0, 0}; - param.paddings = std::make_shared>(paddings); - param.strides = std::vector{1, 1}; - std::vector dilations = {1, 1}; - param.dilations = std::make_shared>(dilations); - - std::unique_ptr context(new KernelContext); - context->As().InitOnce(); - - kernel->SetParam(param); - std::unique_ptr dep_context(new KernelContext); - context->As().CopySharedTo( - &(dep_context->As())); - kernel->SetContext(std::move(dep_context)); - - LOG(INFO) << "kernel ready"; - std::default_random_engine engine; - std::uniform_real_distribution gen(-5, 5); - std::vector input_v(1 * 32 * 112 * 112); - std::vector filter_v(32 * 1 * 3 * 3); - for (auto& i : input_v) { - i = gen(engine); - } - for (auto& f : filter_v) { - f = gen(engine); - } - - LOG(INFO) << "prepare input"; - input.Resize({1, 32, 112, 112}); - CLImageConverterDefault* default_converter = new CLImageConverterDefault(); - DDim input_image_shape = - default_converter->InitImageDimInfoWith(input.dims()); - LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " " - << input_image_shape[1]; - std::vector input_image_data(input_image_shape.production() * - 4); // 4 : RGBA - default_converter->NCHWToImage( - input_v.data(), input_image_data.data(), input.dims()); - auto* input_image = input.mutable_data( - input_image_shape[0], input_image_shape[1], input_image_data.data()); - - LOG(INFO) << "prepare kernel"; - filter.Resize({32, 1, 3, 3}); - CLImageConverterNWBlock* nw_converter = new CLImageConverterNWBlock(); - DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims()); - LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " " - << filter_image_shape[1]; - std::vector filter_image_data(filter_image_shape.production() * - 4); // 4 : RGBA - nw_converter->NCHWToImage( - filter_v.data(), filter_image_data.data(), filter.dims()); - auto* filter_image = filter.mutable_data( - filter_image_shape[0], filter_image_shape[1], filter_image_data.data()); - - LOG(INFO) << "launch"; - output.Resize({1, 32, 110, 110}); - DDim output_image_shape = - default_converter->InitImageDimInfoWith(output.dims()); - LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " " - << output_image_shape[1]; - auto* output_image = output.mutable_data( - output_image_shape[0], output_image_shape[1]); - - kernel->Launch(); - - auto* wait_list = context->As().cl_wait_list(); - auto* out_ptr = param.output->data(); - auto it 
= wait_list->find(out_ptr); - if (it != wait_list->end()) { - VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; - LOG(INFO) << "--- Find the sync event for the target cl tensor. ---"; - auto& event = *(it->second); - event.wait(); - } else { - LOG(FATAL) << "Could not find the sync event for the target cl tensor."; - LOG(INFO) << "Could not find the sync event for the target cl tensor."; - } - - lite::Tensor output_ref; - output_ref.Resize({1, 32, 110, 110}); - auto* output_ref_data = output_ref.mutable_data(TARGET(kARM)); - depth_conv(input_v.data(), - input.dims(), - filter_v.data(), - filter.dims(), - output_ref_data, - output_ref.dims()); - - const size_t cl_image2d_row_pitch{0}; - const size_t cl_image2d_slice_pitch{0}; - - float* output_image_data = new float[output_image_shape.production() * 4]; - TargetWrapperCL::ImgcpySync(output_image_data, - output_image, - output_image_shape[0], - output_image_shape[1], - cl_image2d_row_pitch, - cl_image2d_slice_pitch, - IoDirection::DtoH); - - float* output_data = new float[output_image_shape.production() * 4]; - default_converter->ImageToNCHW( - output_image_data, output_data, output_image_shape, output.dims()); - - LOG(INFO) << "output_data vs output_ref_data"; - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); - LOG(INFO) << output_data[i] << " " << output_ref_data[i]; - } -} - } // namespace lite } // namespace paddle USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFP16, kImageDefault, image2d); diff --git a/lite/kernels/opencl/depthwise_conv2d_basic_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_image2d_compute_test.cc similarity index 72% rename from lite/kernels/opencl/depthwise_conv2d_basic_compute_test.cc rename to lite/kernels/opencl/depthwise_conv2d_image2d_compute_test.cc index 96ee99e538cc2f293d1f97b2b70a678a0a8ef7b9..1b96ffe0502c3e2d654f88e9c9ac35d20704ca01 100644 --- a/lite/kernels/opencl/depthwise_conv2d_basic_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_image2d_compute_test.cc @@ -142,7 +142,7 @@ TEST(depthwise_conv2d_basic, compute) { VLOG(4) << "to get kernel ..."; auto kernels = - KernelRegistry::Global().Create("depthwise_conv2d_basic", + KernelRegistry::Global().Create("depthwise_conv2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)); @@ -383,7 +383,133 @@ TEST(depthwise_conv2d_basic, compute) { #endif } +TEST(depthwise_conv2d_image2d_fp16, compute) { + LOG(INFO) << "to get kernel ..."; + auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", + TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + + LOG(INFO) << "get kernel"; + lite::Tensor input, filter, output; + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + std::vector paddings = {0, 0}; + param.paddings = std::make_shared>(paddings); + param.strides = std::vector{1, 1}; + std::vector dilations = {1, 1}; + param.dilations = std::make_shared>(dilations); + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr dep_context(new KernelContext); + context->As().CopySharedTo( + &(dep_context->As())); + kernel->SetContext(std::move(dep_context)); + + LOG(INFO) << "kernel ready"; + std::default_random_engine engine; + std::uniform_real_distribution gen(-5, 5); + std::vector 
input_v(1 * 32 * 112 * 112); + std::vector filter_v(32 * 1 * 3 * 3); + for (auto& i : input_v) { + i = gen(engine); + } + for (auto& f : filter_v) { + f = gen(engine); + } + + LOG(INFO) << "prepare input"; + input.Resize({1, 32, 112, 112}); + CLImageConverterDefault* default_converter = new CLImageConverterDefault(); + DDim input_image_shape = + default_converter->InitImageDimInfoWith(input.dims()); + LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " " + << input_image_shape[1]; + std::vector input_image_data(input_image_shape.production() * + 4); // 4 : RGBA + default_converter->NCHWToImage( + input_v.data(), input_image_data.data(), input.dims()); + auto* input_image = input.mutable_data( + input_image_shape[0], input_image_shape[1], input_image_data.data()); + + LOG(INFO) << "prepare kernel"; + filter.Resize({32, 1, 3, 3}); + CLImageConverterNWBlock* nw_converter = new CLImageConverterNWBlock(); + DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims()); + LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " " + << filter_image_shape[1]; + std::vector filter_image_data(filter_image_shape.production() * + 4); // 4 : RGBA + nw_converter->NCHWToImage( + filter_v.data(), filter_image_data.data(), filter.dims()); + auto* filter_image = filter.mutable_data( + filter_image_shape[0], filter_image_shape[1], filter_image_data.data()); + + LOG(INFO) << "launch"; + output.Resize({1, 32, 110, 110}); + DDim output_image_shape = + default_converter->InitImageDimInfoWith(output.dims()); + LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " " + << output_image_shape[1]; + auto* output_image = output.mutable_data( + output_image_shape[0], output_image_shape[1]); + + kernel->Launch(); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; + LOG(INFO) << "--- Find the sync event for the target cl tensor. 
---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target cl tensor."; + LOG(INFO) << "Could not find the sync event for the target cl tensor."; + } + + lite::Tensor output_ref; + output_ref.Resize({1, 32, 110, 110}); + auto* output_ref_data = output_ref.mutable_data(TARGET(kARM)); + depth_conv(input_v.data(), + input.dims(), + filter_v.data(), + filter.dims(), + output_ref_data, + output_ref.dims()); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + float* output_image_data = new float[output_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(output_image_data, + output_image, + output_image_shape[0], + output_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + + float* output_data = new float[output_image_shape.production() * 4]; + default_converter->ImageToNCHW( + output_image_data, output_data, output_image_shape, output.dims()); + + LOG(INFO) << "output_data vs output_ref_data"; + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + LOG(INFO) << output_data[i] << " " << output_ref_data[i]; + } +} + } // namespace lite } // namespace paddle -USE_LITE_KERNEL( - depthwise_conv2d_basic, kOpenCL, kFloat, kImageDefault, image2d); +USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, image2d); diff --git a/lite/kernels/opencl/fc_compute_test.cc b/lite/kernels/opencl/fc_compute_test.cc index 7f0c9c49a9920b10ceaa29cd1b548f59d5758f3b..863eab6297a88bcb2827c6ed09dfd1cecd7fae2d 100644 --- a/lite/kernels/opencl/fc_compute_test.cc +++ b/lite/kernels/opencl/fc_compute_test.cc @@ -66,6 +66,8 @@ void PrintData(std::string name, float* a, const int rows, const int cols) { } } +// buffer +#if 0 // fc_buffer // #define PRINT_RESULT #define LOOP_TEST TEST(fc, compute) { @@ -193,8 +195,9 @@ TEST(fc, compute) { } // m #endif } +#endif // fc_buffer } // namespace lite } // namespace paddle -USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def); +// USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def); diff --git a/lite/kernels/opencl/pool_compute.cc b/lite/kernels/opencl/pool_compute.cc index fca2cbe96d56b65e5f33acacff20c781b3400ed0..c0a00e87b8ad67ba0028ff4fa57f0811d52c1f0a 100644 --- a/lite/kernels/opencl/pool_compute.cc +++ b/lite/kernels/opencl/pool_compute.cc @@ -229,15 +229,15 @@ class PoolComputeImage2D : public KernelLite{3, 6, 10, 10}); @@ -414,7 +415,7 @@ TEST(sigmoid_image2d_fp16, compute) { } // namespace paddle // sigmoid buffer -USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def); +// USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def); // sigmoid image2d fp32 USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); diff --git a/lite/kernels/x86/activation_compute.h b/lite/kernels/x86/activation_compute.h index 34a932ed1726b5b99337847b22e0a168e0290c84..5d8110e67c17f3a0f8d3211179df831dad83cc9b 100644 --- a/lite/kernels/x86/activation_compute.h +++ b/lite/kernels/x86/activation_compute.h @@ -222,14 +222,6 @@ class GeluCompute : public KernelLite { }; // softsign(x) = x / (1 + |x|) -template -struct SoftsignFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) { - out.device(d) = x / (static_cast(1) + x.abs()); - } -}; - template class SoftsignCompute : public KernelLite { public: @@ -238,9 +230,13 @@ class SoftsignCompute : public KernelLite { void Run() override { // auto& context = ctx_->As(); auto& param = 
*param_.get_mutable(); - param.Out->template mutable_data(); - Activate>(param.X, param.Out); + const T* x_data = param.X->data(); + T* out_data = param.Out->mutable_data(); + size_t x_size = param.X->numel(); + for (size_t i = 0; i < x_size; i++) { + out_data[i] = x_data[i] / (static_cast(1) + std::abs(x_data[i])); + } } virtual ~SoftsignCompute() = default; diff --git a/lite/kernels/x86/gru_compute.h b/lite/kernels/x86/gru_compute.h index 948485105a763aeefbbd7a77b91a7eefdeb17b57..89076b51dae1fed4b8f56b280f177caf1f142158 100644 --- a/lite/kernels/x86/gru_compute.h +++ b/lite/kernels/x86/gru_compute.h @@ -48,6 +48,10 @@ inline void ReorderInitState(const lite::Context& context, row_shuffle(context, src, index_lod, dst, indexed_src); } +static inline int64_t CalculateSeqWidth(const DDim& dims) { + return dims.count(1, dims.size()); +} + template class GRUCompute : public KernelLite { public: @@ -65,15 +69,16 @@ class GRUCompute : public KernelLite { auto* bias = param.bias; auto* batch_gate = param.batch_gate; - batch_gate->mutable_data(); auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; - batch_reset_hidden_prev->mutable_data(); auto* batch_hidden = param.batch_hidden; - batch_hidden->mutable_data(); + T* batch_gate_ptr = batch_gate->mutable_data(); + T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data(); + T* batch_hidden_ptr = batch_hidden->mutable_data(); + auto* hidden = param.hidden; hidden->mutable_data(); - auto hidden_dims = hidden->dims(); + const auto& hidden_dims = hidden->dims(); lite::x86::math::LoDTensor2BatchFunctor to_batch; to_batch(context, *input, batch_gate, true, is_reverse); @@ -90,19 +95,23 @@ class GRUCompute : public KernelLite { const_cast(weight_data + 2 * frame_size * frame_size); Tensor ordered_h0; - std::vector order(batch_gate->lod()[2]); - if (h0) { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. 
+ const std::vector& order(batch_gate->lod()[2]); ReorderInitState(context, *h0, order, &ordered_h0, true); gru_value.prev_out_value = ordered_h0.mutable_data(); } else { gru_value.prev_out_value = nullptr; } - auto batch_starts = batch_gate->lod()[0]; + + const auto& batch_starts = batch_gate->lod()[0]; size_t seq_len = batch_starts.size() - 1; + int64_t batch_gate_width = CalculateSeqWidth(batch_gate->dims()); + int64_t batch_reset_hidden_prev_width = + CalculateSeqWidth(batch_reset_hidden_prev->dims()); + int64_t batch_hidden_width = CalculateSeqWidth(batch_hidden->dims()); auto active_node = lite::x86::math::detail::GetActivationType(param.activation); auto active_gate = @@ -145,13 +154,10 @@ class GRUCompute : public KernelLite { int64_t bend = static_cast(batch_starts[n + 1]); int64_t cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = - batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.mutable_data(); - gru_value.gate_value = gate_t.mutable_data(); - gru_value.reset_output_value = reset_hidden_prev_t.mutable_data(); + gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width; + gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width; + gru_value.reset_output_value = batch_reset_hidden_prev_ptr + + bstart * batch_reset_hidden_prev_width; if (gru_value.prev_out_value) { blas.GEMM_COMPUTE(CblasNoTrans, @@ -188,13 +194,10 @@ class GRUCompute : public KernelLite { int64_t bend = static_cast(batch_starts[n + 1]); int64_t cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = - batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.mutable_data(); - gru_value.gate_value = gate_t.mutable_data(); - gru_value.reset_output_value = reset_hidden_prev_t.mutable_data(); + gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width; + gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width; + gru_value.reset_output_value = batch_reset_hidden_prev_ptr + + bstart * batch_reset_hidden_prev_width; lite::x86::math::GRUUnitFunctor::compute( context, diff --git a/lite/kernels/x86/reduce_op_function.h b/lite/kernels/x86/reduce_op_function.h index 46e1248e070350ca82c73b639f8a924958460901..179a06164dc4aa73683ba8803bce1f7733bae141 100644 --- a/lite/kernels/x86/reduce_op_function.h +++ b/lite/kernels/x86/reduce_op_function.h @@ -63,7 +63,19 @@ void ReduceFunctor(const lite::Tensor& input, auto out = EigenScalar::From(output); functor(&x, &out, reduce_dim); } else { - auto out = EigenTensor::From(*output, output->dims()); + std::vector out_dims; + if (keep_dim) { + // Construct the squeezed dims. + const int kDelFlag = -2; + out_dims = output->dims().Vectorize(); + for (size_t i = 0; i < dims.size(); ++i) { + out_dims[reduce_dim[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + auto out = EigenTensor::From( + *output, keep_dim ? 
DDim(out_dims) : output->dims()); functor(&x, &out, reduce_dim); } } diff --git a/lite/kernels/x86/softmax_compute.h b/lite/kernels/x86/softmax_compute.h index 8063cf6566157bb5bd63449ce1655dd024983d1a..5a18a8022773682c0853a3592a9925f3a6015e83 100644 --- a/lite/kernels/x86/softmax_compute.h +++ b/lite/kernels/x86/softmax_compute.h @@ -55,24 +55,33 @@ class SoftmaxCompute : public KernelLite { auto& context = ctx_->As(); CHECK(param.output); CHECK(param.x); - param.output->mutable_data(); - const int rank = param.x->dims().size(); + + auto* x = param.x; + auto* output = param.output; + output->mutable_data(); + + const int rank = x->dims().size(); const int axis = CanonicalAxis(param.axis, rank); - int axis_dim = param.x->dims()[axis]; - const int n = SizeToAxis(axis, param.x->dims()); - const int d = SizeFromAxis(axis, param.x->dims()); + int axis_dim = x->dims()[axis]; + if (rank == 2 && axis == 1) { + lite::x86::math::SoftmaxFunctor()( + context, axis_dim, x, output); + } else { + const int n = SizeToAxis(axis, x->dims()); + const int d = SizeFromAxis(axis, x->dims()); - DDim shape(std::vector{n, d}); + DDim x_dims = x->dims(); + DDim out_dims = output->dims(); - Tensor input_2d; - Tensor out_2d; - input_2d.ShareDataWith(*param.x); - input_2d.Resize(shape); - out_2d.ShareDataWith(*param.output); - out_2d.Resize(shape); + DDim shape_2d(std::vector{n, d}); + x->Resize(shape_2d); + output->Resize(shape_2d); - lite::x86::math::SoftmaxFunctor()( - context, axis_dim, &input_2d, &out_2d); + lite::x86::math::SoftmaxFunctor()( + context, axis_dim, x, output); + x->Resize(x_dims); + output->Resize(out_dims); + } } virtual ~SoftmaxCompute() = default; diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 5d00570703f2caaf71ff5b5e1e6c3ad9e27eb6f2..42d132b418c9bf806d35ad2d8f302b190ce660e2 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -544,7 +544,6 @@ void SaveModelNaive(const std::string &model_dir, const Scope &exec_scope, const cpp::ProgramDesc &cpp_prog, bool combined) { - MkDirRecur(model_dir); // Save program const std::string prog_path = model_dir + ".nb"; naive_buffer::BinaryTable table; @@ -571,7 +570,7 @@ void SaveModelNaive(const std::string &model_dir, paddle_version_length); paddle_version_table.Consume(paddle_version_length); paddle_version_table.AppendToFile(prog_path); - VLOG(4) << "paddle_version:" << paddle_version << std::endl; + VLOG(4) << "paddle_version:" << paddle_version; // Save topology_size(uint64) into file naive_buffer::BinaryTable topology_size_table; @@ -586,7 +585,8 @@ void SaveModelNaive(const std::string &model_dir, // Save Params SaveCombinedParamsNaive(prog_path, exec_scope, cpp_prog); - LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully"; + LOG(INFO) << "Save naive buffer model in '" << model_dir + << ".nb' successfully"; } #endif @@ -696,6 +696,13 @@ void LoadModelNaive(const std::string &model_dir, CHECK(scope); cpp_prog->ClearBlocks(); + LOG(WARNING) + << "WARNING: MobileConfig::set_model_dir and " + "MobileConfig::set_model_buffer are deprecated APIs " + "and will be removed in latter release. 
\n" + " MobileConfig::set_model_from_file(const std::string& model_file)" + " and MobileConfig::set_model_from_buffer(const std::string& " + "model_buffer) are recommended."; // Load model const std::string prog_path = model_dir + "/__model__.nb"; naive_buffer::BinaryTable table; @@ -786,11 +793,23 @@ void LoadModelNaiveFromFile(const std::string &filename, // (2)get opt version char opt_version[16]; - const uint64_t paddle_version_length = 16 * sizeof(char); + const uint64_t opt_version_length = 16 * sizeof(char); ReadModelDataFromFile( - opt_version, prog_path, &offset, paddle_version_length); + opt_version, prog_path, &offset, opt_version_length); VLOG(4) << "Opt_version:" << opt_version; + // check version, opt's version should be consistent with current Paddle-Lite + // version. + const std::string paddle_version = version(); + const std::string opt_version_str = opt_version; + if (paddle_version != opt_version_str) { + LOG(WARNING) << "warning: the version of opt that transformed this model " + "is not consistent with current Paddle-Lite version." + "\n version of opt:" + << opt_version + << "\n version of current Paddle-Lite:" << paddle_version; + } + // (3)get topo_size uint64_t topo_size; ReadModelDataFromFile( diff --git a/lite/operators/reduce_ops.cc b/lite/operators/reduce_ops.cc index 3f0de174715a6fd718694fb31e9d7cb7c08cf2f9..e2cc56b416dd166e6b22a0c642907844ab964cc5 100644 --- a/lite/operators/reduce_ops.cc +++ b/lite/operators/reduce_ops.cc @@ -50,20 +50,18 @@ bool ReduceOp::InferShape() const { } else { size_t out_rank = keep_dim ? x_rank : x_rank - dims.size(); std::vector out_dims(out_rank); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - out_dims[dims[i]] = 1; - } - } else { - sort(dims.begin(), dims.end()); - int dim_index = 0; - int out_index = 0; - for (size_t i = 0; i < x_rank; ++i) { - if (dims[dim_index] == static_cast(i)) { - dim_index++; - } else { - out_dims[out_index++] = x_dims[i]; + sort(dims.begin(), dims.end()); + int dim_index = 0; + int out_index = 0; + for (size_t i = 0; i < x_rank; ++i) { + if (dim_index < dims.size() && + dims[dim_index] == static_cast(i)) { + if (keep_dim) { + out_dims[out_index++] = 1; } + dim_index++; + } else { + out_dims[out_index++] = x_dims[i]; } } param_.output->Resize(out_dims); diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 3d6eb9eb8ace0dde1fba92af88ab3af20a87a2ed..c55f62c02977cec54b1ef679a038e06cb576b6b8 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -44,6 +44,7 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS 
arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/reduce_sum_compute_test.cc b/lite/tests/kernels/reduce_sum_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cfe213750b1191c1ef8fe7fba1b1c1035c2ae42 --- /dev/null +++ b/lite/tests/kernels/reduce_sum_compute_test.cc @@ -0,0 +1,350 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +void reduce_sum_n(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index, src_index; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int n = 0; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] += static_cast(src[src_index]); + } + } + } + } +} + +void reduce_sum_c(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int c = 0; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] += static_cast(src[src_index]); + } + } + } + } +} + +void reduce_sum_h(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index, src_index, src_index0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = 0.0; + for (int h = 0; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] += static_cast(src[src_index]); + } + } + } + } +} + +void reduce_sum_w(const float* src, + float* 
dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = 0.0; + for (int w = 0; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] += static_cast(src[src_index]); + } + } + } + } +} + +void reduce_sum_all(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + float sum = 0.0; + int src_index; + int n_id, c_id; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + sum = sum + src[src_index]; + } + } + } + } + dst[0] = sum; +} + +void reduce_sum_nc(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce n first. + DDimLite ddimA({1, channel_in, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_sum_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_sum_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +void reduce_sum_ch(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce c first + DDimLite ddimA({num_in, 1, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_sum_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_sum_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +void reduce_sum_hw(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce h first + DDimLite ddimA({num_in, channel_in, 1, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_sum_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_sum_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +class ReduceSumComputeTester : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string input_ = "x"; + std::string output_ = "out"; + std::vector dim_{0}; + bool keep_dim_ = false; + bool reduce_all_ = false; + DDim x_dims_{{3, 2, 3, 4}}; + + public: + ReduceSumComputeTester(const Place& place, + const std::string& alias, + std::vector dim, + bool keep_dim, + bool reduce_all, + DDim x_dims) + : TestCase(place, alias), + dim_(dim), + keep_dim_(keep_dim), + reduce_all_(reduce_all), + x_dims_(x_dims) {} + + void RunBaseline(Scope* scope) override { + auto* x = scope->FindMutableTensor(input_); + const auto* x_data = x->data(); + auto* out = scope->NewTensor(output_); + auto x_rank = x_dims_.size(); + if (!dim_.empty()) { + for (int i = 0; i < dim_.size(); i++) { + if (dim_[i] < 0) { + dim_[i] += x_rank; + } + } + } + + sort(dim_.begin(), dim_.end()); + std::vector out_dims; + if (reduce_all_) { + if (keep_dim_) { + out_dims.resize(x_rank); + for (int i = 0; i < x_rank; ++i) { + out_dims[i] = 1; + } + } else { + out_dims.push_back(1); + } + } else { + for (int i = 0; i < x_dims_.size(); i++) { + out_dims.push_back(x_dims_[i]); + } + if (keep_dim_) { + for (size_t i = 0; i < dim_.size(); ++i) { + out_dims[dim_[i]] = 1L; + } + } else { + int64_t kDelFlag = -2; + for (size_t i = 0; i < dim_.size(); ++i) { + out_dims[dim_[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + } + out->Resize(DDim(out_dims)); + + auto* out_data = out->mutable_data(); + int in_n = x_dims_[0]; + int in_c = x_dims_[1]; + int in_h = x_dims_[2]; + int in_w = x_dims_[3]; + + if (reduce_all_) { + reduce_sum_all(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim_.size() == 1) { + switch (dim_[0]) { + case 0: + reduce_sum_n(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 1: + reduce_sum_c(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 2: + reduce_sum_h(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 3: + reduce_sum_w(x_data, out_data, in_n, in_c, in_h, in_w); + break; + default: + LOG(FATAL) << "error!!!"; + } + } else if (dim_.size() == 2) { + if (dim_[0] == 0 && dim_[1] == 1) { + reduce_sum_nc(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim_[0] == 1 && dim_[1] == 2) { + reduce_sum_ch(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim_[0] == 2 && dim_[1] == 3) { + reduce_sum_hw(x_data, out_data, in_n, in_c, in_h, in_w); + } else { + LOG(FATAL) << "invalid dims_!!"; + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("reduce_sum"); + op_desc->SetInput("X", {input_}); + op_desc->SetOutput("Out", {output_}); + op_desc->SetAttr("dim", dim_); + op_desc->SetAttr("keep_dim", keep_dim_); + op_desc->SetAttr("reduce_all", reduce_all_); + } + + void PrepareData() override { + std::vector data(x_dims_.production()); + for (int i = 0; i < x_dims_.production(); i++) { + data[i] = i * 1.0; + } + SetCommonTensor(input_, x_dims_, data.data()); + } +}; + +void test_reduce_sum(Place place) { + std::vector> reduce_dim{ + {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}}; + for (auto n : {1, 3}) { + for (auto c : {1, 2}) { + for (auto h : {1, 3}) { + for (auto w : {1, 3}) { + for (bool keep_dim : {false, true}) { + for (bool reduce_all : {false, true}) { + for (auto dim : reduce_dim) { + auto x_dims = DDim(std::vector({n, c, h, w})); + std::unique_ptr tester( + new ReduceSumComputeTester( + place, "def", dim, keep_dim, reduce_all, x_dims)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } + } + } + } +} + 
+TEST(ReduceSum, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_reduce_sum(place); +#endif + // #ifdef LITE_WITH_ARM + // Place place(TARGET(kARM)); + // test_reduce_sum(place); + // #endif +} + +} // namespace lite +} // namespace paddle
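
The reduce-op changes in this patch — `ReduceOp::InferShape`, the `keep_dim` handling added to `ReduceFunctor`, and the baseline in the new `reduce_sum_compute_test.cc` — all derive the output shape from the same rule. The sketch below summarizes that rule in isolation; `ReducedOutDims` is a hypothetical helper used only for illustration and is not part of the patch or of the Paddle-Lite API.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Minimal sketch (illustration only): compute the reduced output shape from
// the input shape, the reduce axes, keep_dim and reduce_all, mirroring the
// logic used by ReduceOp::InferShape and the reduce_sum test baseline above.
std::vector<int64_t> ReducedOutDims(const std::vector<int64_t>& x_dims,
                                    std::vector<int> dims,
                                    bool keep_dim,
                                    bool reduce_all) {
  const int x_rank = static_cast<int>(x_dims.size());
  for (auto& d : dims) {
    if (d < 0) d += x_rank;  // normalize negative axes, e.g. -1 -> x_rank - 1
  }
  std::sort(dims.begin(), dims.end());
  if (reduce_all || dims.empty()) {
    // Reducing every axis leaves a single element; keep_dim preserves rank.
    return keep_dim ? std::vector<int64_t>(x_rank, 1)
                    : std::vector<int64_t>{1};
  }
  std::vector<int64_t> out_dims;
  size_t di = 0;
  for (int i = 0; i < x_rank; ++i) {
    if (di < dims.size() && dims[di] == i) {
      if (keep_dim) out_dims.push_back(1);  // reduced axis kept as size 1
      ++di;
    } else {
      out_dims.push_back(x_dims[i]);  // untouched axis keeps its extent
    }
  }
  return out_dims;
}
```

For example, with `x_dims = {3, 2, 3, 4}` and `dims = {1, 2}`, the result is `{3, 4}` when `keep_dim` is false and `{3, 1, 1, 4}` when it is true, which matches the shapes exercised by `test_reduce_sum`.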