diff --git a/doc/fluid/Paddle-Inference/.gitignore b/doc/fluid/Paddle-Inference/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f21889b9b421b16615bd52e1152e8abe61d6040c --- /dev/null +++ b/doc/fluid/Paddle-Inference/.gitignore @@ -0,0 +1,7 @@ +*~ +*.pyc +*.DS_Store +._* +docs/_build/ +docs/api/ +docs/doxyoutput/ diff --git a/doc/fluid/Paddle-Inference/LICENSE b/doc/fluid/Paddle-Inference/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/doc/fluid/Paddle-Inference/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/doc/fluid/Paddle-Inference/README.md b/doc/fluid/Paddle-Inference/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5164b6f3e62345c76228569c56604055076d9e81 --- /dev/null +++ b/doc/fluid/Paddle-Inference/README.md @@ -0,0 +1,20 @@ +# Paddle Inference Demos + + + +Paddle Inference为飞桨核心框架推理引擎。Paddle Inference功能特性丰富,性能优异,针对服务器端应用场景进行了深度的适配优化,做到高吞吐、低时延,保证了飞桨模型在服务器端即训即用,快速部署。 + + +为了能让广大用户快速的使用Paddle Inference进行部署应用,我们在此Repo中提供了C++、Python的使用样例。 + + +**在这个repo中我们会假设您已经对Paddle Inference有了一定的了解。** + +**如果您刚刚接触Paddle Inference不久,建议您[访问这里](https://paddle-inference.readthedocs.io/en/latest/#)对Paddle Inference做一个初步的认识。** + + +## 测试样例 + +1) 在python目录中,我们通过真实输入的方式罗列了一系列的测试样例,其中包括图像的分类,分割,检测,以及NLP的Ernie/Bert等Python使用样例,同时也包含Paddle-TRT, 多线程的使用样例。 + +2) 在c++目录中,我们通过单测方式展现了一系列的测试样例,其中包括图像的分类,分割,检测,以及NLP的Ernie/Bert等C++使用样例,同时也包含Paddle-TRT, 多线程的使用样例。 diff --git a/doc/fluid/Paddle-Inference/c++/LIC2020/CMakeLists.txt b/doc/fluid/Paddle-Inference/c++/LIC2020/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2206d484e5c271afc8bc00d06aaaee05b88c51fc --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/LIC2020/CMakeLists.txt @@ -0,0 +1,96 @@ +cmake_minimum_required(VERSION 3.0) +project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) + + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_STATIC_LIBRARY_PREFIX "") +message("flags" ${CMAKE_CXX_FLAGS}) + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + + +include_directories("${PADDLE_LIB}") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_ROOT}/include") + link_directories("${TENSORRT_ROOT}/lib") +endif() + +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") + +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) + +if(WITH_MKL) + include_directories("${PADDLE_LIB}/third_party/install/mklml/include") + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif() +else() + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +set(EXTERNAL_LIB "-lrt -ldl -lpthread") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash + ${EXTERNAL_LIB}) + +if(WITH_GPU) + if (USE_TENSORRT) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/libcublas${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX} ) +endif() + +target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/doc/fluid/Paddle-Inference/c++/LIC2020/README.md b/doc/fluid/Paddle-Inference/c++/LIC2020/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..35594392e869011eb9c8b75709a97c09f8522ae4 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/LIC2020/README.md @@ -0,0 +1,52 @@ +## 运行C++ LIC2020关系抽取 demo + +该工程是[2020语言与智能技术竞赛:关系抽取任务](https://aistudio.baidu.com/aistudio/competition/detail/31)竞赛提供的基线模型的c++预测demo,基线模型的训练相关信息可参考[LIC2020-关系抽取任务基线系统](https://aistudio.baidu.com/aistudio/projectdetail/357344)。 + +### 一:获取基线模型 + +点击[链接](https://paddle-inference-dist.bj.bcebos.com/lic_model.tgz)下载模型,如果你想获取更多的**模型训练信息**,请访问[LIC2020-关系抽取任务基线系统](https://aistudio.baidu.com/aistudio/projectdetail/357344)。 + +### 二:**样例编译** + +文件`demo.cc` 为预测的样例程序(程序中的输入为固定值,如果您有其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +文件`CMakeLists.txt` 为编译构建文件。 +脚本`run_impl.sh` 包含了第三方库、预编译库的信息配置。 + +编译demo样例,我们首先需要对脚本`run_impl.sh` 文件中的配置进行修改。 + +1)**修改`run_impl.sh`** + +打开`run_impl.sh`,我们对以下的几处信息进行修改: + +```shell +# 根据预编译库中的version.txt信息判断是否将以下三个标记打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=OFF + +# 配置预测库的根目录 +LIB_DIR=${YOUR_LIB_DIR}/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。 +CUDNN_LIB=/usr/local/cudnn/lib64 +CUDA_LIB=/usr/local/cuda/lib64 +# TENSORRT_ROOT=/usr/local/TensorRT-6.0.1.5 +``` + +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例 +./demo -model_file ${LIC_MODEL_PATH}/model --params_file ${LIC_MODEL_PATH}/params +``` + +运行结束后,程序会将模型输出个数打印到屏幕,说明运行成功。 + +### 更多链接 +- [Paddle Inference使用Quick Start!]() +- [Paddle Inference Python Api使用]() diff --git a/doc/fluid/Paddle-Inference/c++/LIC2020/demo.cc b/doc/fluid/Paddle-Inference/c++/LIC2020/demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..e15336929f1f8482281ba87aa759d738ad965643 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/LIC2020/demo.cc @@ -0,0 +1,157 @@ +#include +#include + +#include +#include +#include +#include + +#include "paddle/include/paddle_inference_api.h" + +using paddle::AnalysisConfig; + +DEFINE_string(model_file, "", "Directory of the inference model."); +DEFINE_string(params_file, "", "Directory of the inference model."); +DEFINE_string(model_dir, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Directory of the inference model."); +DEFINE_int32(seq_len, + 128, + "sequence length, should less than or equal to 512."); +DEFINE_bool(use_gpu, true, "enable gpu"); + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +std::unique_ptr CreatePredictor() { + AnalysisConfig config; + if (FLAGS_model_dir != "") { + config.SetModel(FLAGS_model_dir); + } else { + config.SetModel(FLAGS_model_file, FLAGS_params_file); + } + if (FLAGS_use_gpu) { + config.EnableUseGpu(100, 0); + } + // We use ZeroCopy, so we set config->SwitchUseFeedFetchOps(false) + config.SwitchUseFeedFetchOps(false); + return CreatePaddlePredictor(config); +} + +template +std::vector PrepareInput(const std::vector& shape, + int word_size = 18000); + +template <> +std::vector PrepareInput(const std::vector& shape, int word_size) { + int count = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + std::vector datas(count, 0); + for (int i = 0; i < count; ++i) { + datas[i] = i % 2 ? 
0.f : 1.f; + } + return datas; +} + +template <> +std::vector PrepareInput(const std::vector& shape, + int word_size) { + int count = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + std::vector datas(count, 0); + for (int i = 0; i < count; ++i) { + datas[i] = (i + 13) % word_size == 0 ? 1 : (i + 13) % word_size; + } + return datas; +} + +void Run(paddle::PaddlePredictor* predictor, + std::vector* out_data_0, + std::vector* out_data_1, + std::vector* out_data_2) { + const int word_size_0 = 18000; + const int word_size_1 = 2; + const int word_size_2 = 513; + + const int batch_size = FLAGS_batch_size; + const int seq_len = FLAGS_seq_len; + + auto input_names = predictor->GetInputNames(); + std::vector shape_seq{batch_size, seq_len, 1}; + std::vector shape_batch{batch_size}; + std::vector shape_s{batch_size, seq_len}; + +#define INPUT_EMB(num) \ + auto input_##num = predictor->GetInputTensor(input_names[num]); \ + auto data_##num = PrepareInput(shape_seq, word_size_##num); \ + input_##num->Reshape(shape_seq); \ + input_##num->copy_from_cpu(data_##num.data()) + + INPUT_EMB(0); + INPUT_EMB(1); + INPUT_EMB(2); + +#undef INPUT_EMB + + auto input_3 = predictor->GetInputTensor(input_names[3]); + auto data_3 = PrepareInput(shape_seq); + input_3->Reshape(shape_seq); + input_3->copy_from_cpu(data_3.data()); + + auto input_4 = predictor->GetInputTensor(input_names[4]); + auto data_4 = PrepareInput(shape_batch); + input_4->Reshape(shape_batch); + input_4->copy_from_cpu(data_4.data()); + +#define INPUT_5_or_6(num) \ + auto input_##num = predictor->GetInputTensor(input_names[num]); \ + auto data_##num = PrepareInput(shape_s, 1); \ + input_##num->Reshape(shape_s); \ + input_##num->copy_from_cpu(data_##num.data()) + + INPUT_5_or_6(5); + INPUT_5_or_6(6); + +#undef INPUT_5_or_6 + + CHECK(predictor->ZeroCopyRun()); + + auto output_names = predictor->GetOutputNames(); + // there is three output of lic2020 baseline model + +#define OUTPUT(num) \ + auto output_##num = predictor->GetOutputTensor(output_names[num]); \ + std::vector output_shape_##num = output_##num->shape(); \ + int out_num_##num = std::accumulate(output_shape_##num.begin(), \ + output_shape_##num.end(), \ + 1, \ + std::multiplies()); \ + out_data_##num->resize(out_num_##num); \ + output_##num->copy_to_cpu(out_data_##num->data()) + + OUTPUT(0); + OUTPUT(1); + OUTPUT(2); + +#undef OUTPUT +} + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + auto predictor = CreatePredictor(); + + std::vector out_data_0; + std::vector out_data_1; + std::vector out_data_2; + Run(predictor.get(), &out_data_0, &out_data_1, &out_data_2); + + LOG(INFO) << "output0 num is " << out_data_0.size(); + LOG(INFO) << "output1 num is " << out_data_1.size(); + LOG(INFO) << "output2 num is " << out_data_2.size(); + return 0; +} diff --git a/doc/fluid/Paddle-Inference/c++/LIC2020/index.html b/doc/fluid/Paddle-Inference/c++/LIC2020/index.html new file mode 100644 index 0000000000000000000000000000000000000000..4c61dd00171a60aa318afa72297d02a51c77c4ab --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/LIC2020/index.html @@ -0,0 +1,116 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/c++/LIC2020/run_impl.sh b/doc/fluid/Paddle-Inference/c++/LIC2020/run_impl.sh new file mode 100644 index 0000000000000000000000000000000000000000..e76365f85b068fb08080a48d715d0313d5a1e201 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/LIC2020/run_impl.sh @@ -0,0 +1,29 @@ +work_path=$(dirname $(readlink -f $0)) + +mkdir -p build +cd build +rm -rf * + +# same with the demo.cc +DEMO_NAME=demo + +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=OFF + +LIB_DIR=${work_path}/fluid_inference_install_dir +CUDNN_LIB=/usr/local/cudnn/lib64 +CUDA_LIB=/usr/local/cuda/lib64 +# TENSORRT_ROOT=/usr/local/TensorRT-6.0.1.5 + +cmake .. -DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=${WITH_MKL} \ + -DDEMO_NAME=${DEMO_NAME} \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=${USE_TENSORRT} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DTENSORRT_ROOT=${TENSORRT_ROOT} + +make -j diff --git a/doc/fluid/Paddle-Inference/c++/README.md b/doc/fluid/Paddle-Inference/c++/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2d50240ecf59d6e1d00118afdd8d057d0fd7c125 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/README.md @@ -0,0 +1,43 @@ +# C++ 预测样例 + +**如果您看到这个目录,我们会假设您已经对Paddle Inference有了一定的了解。** + +**如果您刚刚接触Paddle Inference不久,建议您[访问这里](https://paddle-inference.readthedocs.io/en/latest/#)对Paddle Inference做一个初步的认识。** + +这个目录包含了图像中使用的分类,检测,以及NLP中Ernie/Bert模型测试样例,同时也包含了Paddle-TRT,多线程等测试样例。 + +为了能够顺利运行样例,请您在环境中准备Paddle Inference C++预编译库。 + +**一:获取编译库:** + +- [官网下载](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)。 +- 自行编译获取。 + +**二:预编译lib目录结构介绍:** + +进入预编译库,目录结构为: + +``` +├── CMakeCache.txt +├── paddle +├── third_party +└── version.txt +``` + +其中`paddle`目录包含了预编译库的头文件以及lib文件。 +`third_party`包含了第三方依赖库的头文件以及lib文件。 + +`version.txt`包含了lib的相关描述信息,包括: + + ``` + GIT COMMIT ID: 06897f7c4ee41295e6e9a0af2a68800a27804f6c + WITH_MKL: ON # 是否带MKL + WITH_MKLDNN: OFF # 是否带MKLDNN + WITH_GPU: ON # 是否支持GPU + CUDA version: 10.1 # CUDA的版本 + CUDNN version: v7。 # CUDNN版本 + WITH_TENSORRT: ON # 是否带TRT + ``` + + +有了预编译库后我们开始进入各个目录进行样例测试吧~ diff --git a/doc/fluid/Paddle-Inference/c++/index.html b/doc/fluid/Paddle-Inference/c++/index.html new file mode 100644 index 0000000000000000000000000000000000000000..c28e56278441bca26ba919ebc703b5ad247ea50d --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/index.html @@ -0,0 +1,107 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/CMakeLists.txt b/doc/fluid/Paddle-Inference/c++/paddle-trt/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1c02c9696190b3701719dc594e0a1353bed0510 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/CMakeLists.txt @@ -0,0 +1,95 @@ +project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) + + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g") +set(CMAKE_STATIC_LIBRARY_PREFIX "") +message("flags" ${CMAKE_CXX_FLAGS}) + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + + +include_directories("${PADDLE_LIB}") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_ROOT}/include") + link_directories("${TENSORRT_ROOT}/lib") +endif() + +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") + +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) + +if(WITH_MKL) + include_directories("${PADDLE_LIB}/third_party/install/mklml/include") + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif() +else() + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +set(EXTERNAL_LIB "-lrt -ldl -lpthread") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash + ${EXTERNAL_LIB}) + +if(WITH_GPU) + if (USE_TENSORRT) + set(DEPS ${DEPS} + 
${TENSORRT_ROOT}/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/libcublas${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX} ) +endif() + +target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/README.md b/doc/fluid/Paddle-Inference/c++/paddle-trt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2576ce4eeafe9b220ea8f9e424eabb0902aee5a --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/README.md @@ -0,0 +1,205 @@ +## 使用Paddle-TRT进行ResNet50图像分类样例 + +该文档为使用Paddle-TRT预测在ResNet50分类模型上的实践demo。如果您刚接触Paddle-TRT,推荐先访问[这里](https://paddle-inference.readthedocs.io/en/latest/optimize/paddle_trt.html)对Paddle-TRT有个初步认识。 + +本目录下, + +- `trt_fp32_test.cc` 为使用Paddle-TRT进行FP32精度预测的样例程序源文件(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +- `trt_gen_calib_table_test.cc` 为离线量化校准中,产出量化校准表的样例程序源文件。 +- `trt_int8_test.cc` 为使用Paddle-TRT进行Int8精度预测的样例程序源文件,根据传入布尔类型参数`use_calib`为`true`或`false`,可以进行加载离线量化校准表进行Int8预测,或加载PaddleSlim量化产出的Int8模型进行预测。 +- `CMakeLists.txt` 为编译构建文件。 +- `run_impl.sh` 包含了第三方库、预编译库的信息配置。 + +### 获取模型 +首先,我们从下列链接下载所需模型: + +[ResNet50 FP32模型](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50.tar.gz) + +[ResNet50 PaddleSlim量化模型](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50_quant.tar.gz) + +其中,FP32模型用于FP32精度预测,以及Int8离线校准预测;量化模型由模型压缩工具库PaddleSlim产出,PaddleSlim模型量化相关信息可以参考[这里](https://paddlepaddle.github.io/PaddleSlim/quick_start/quant_aware_tutorial.html)。使用Paddle-TRT进行Int8量化预测的介绍可以参考[这里](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/docs/optimize/paddle_trt.rst#int8%E9%87%8F%E5%8C%96%E9%A2%84%E6%B5%8B)。 + +### 一、使用TRT FP32精度预测 + +1)**修改`run_impl.sh`** + +打开`run_impl.sh`,我们对以下的几处信息进行修改: + +```shell +# 选择使用fp32预测的demo +DEMO_NAME=trt_fp32_test + +# 本节中,我们使用了TensorRT,需要将USE_TENSORRT打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=ON + +# 配置预测库的根目录 +LIB_DIR=/paddle/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。请注意CUDA和CUDNN需要设置到lib64一层,而TensorRT是设置到根目录一层 +CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.6_cuda10.1/lib64 +CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64 +TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5 +``` + +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例 +./trt_fp32_test --model_file=../ResNet50/model --params_file=../ResNet50/params +``` + +运行结束后,程序会将模型预测输出的前20个结果打印到屏幕,说明运行成功。 + +### 二、使用TRT Int8离线量化预测 + +使用TRT Int8离线量化预测分为两步:生成量化校准表,以及加载校准表执行Int8预测。需要注意的是TRT Int8离线量化预测使用的仍然是ResNet50 FP32 模型,是通过校准表中包含的量化scale在运行时将FP32转为Int8从而加速预测的。 + +#### 生成量化校准表 + +1)**修改`run_impl.sh`** + +打开`run_impl.sh`,我们对以下的几处信息进行修改: + +```shell +# 选择生成量化校准表的demo +DEMO_NAME=trt_gen_calib_table_test + +# 本节中,我们使用了TensorRT,需要将USE_TENSORRT打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=ON + +# 配置预测库的根目录 +LIB_DIR=/paddle/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。请注意CUDA和CUDNN需要设置到lib64一层,而TensorRT是设置到根目录一层 +CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.6_cuda10.1/lib64 +CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64 +TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5 +``` 
+ +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例 +./trt_gen_calib_table_test --model_file=../ResNet50/model --params_file=../ResNet50/params +``` + +运行结束后,模型文件夹`ResNet50`下的`_opt_cache`文件夹下会多出一个名字为`trt_calib_*`的文件,即校准表。 + +#### 加载校准表执行Int8预测 + +再次修改`run_impl.sh`,换成执行Int8预测的demo: + +```shell +# 选择执行Int8预测的demo +DEMO_NAME=trt_int8_test + +# 本节中,我们使用了TensorRT,需要将USE_TENSORRT打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=ON + +# 配置预测库的根目录 +LIB_DIR=/paddle/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。请注意CUDA和CUDNN需要设置到lib64一层,而TensorRT是设置到根目录一层 +CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.6_cuda10.1/lib64 +CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64 +TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5 +``` + +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例,注意此处要将use_calib配置为true +./trt_int8_test --model_file=../ResNet50/model --params_file=../ResNet50/params --use_calib=true +``` + +运行结束后,程序会将模型预测输出的前20个结果打印到屏幕,说明运行成功。 + +**Note** + +观察`trt_gen_calib_table_test`和`trt_int8_test`的代码可以发现,生成校准表和加载校准表进行Int8预测的TensorRT配置是相同的,都是 + +```c++ +config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 5, AnalysisConfig::Precision::kInt8, false, true /*use_calib*/); +``` + +Paddle-TRT判断是生成还是加载校准表的条件是模型目录下`_opt_cache`文件夹里是否有一个名字为`trt_calib_*`的与当前模型对应的校准表文件。在运行时为了防止混淆生成与加载过程,可以通过观察运行log来区分。 + +生成校准表的log: + +``` +I0623 08:40:49.386909 107053 tensorrt_engine_op.h:159] This process is generating calibration table for Paddle TRT int8... +I0623 08:40:49.387279 107057 tensorrt_engine_op.h:352] Prepare TRT engine (Optimize model structure, Select OP kernel etc). This process may cost a lot of time. +I0623 08:41:13.784473 107053 analysis_predictor.cc:791] Wait for calib threads done. +I0623 08:41:14.419198 107053 analysis_predictor.cc:793] Generating TRT Calibration table data, this may cost a lot of time... +``` + +加载校准表预测的log: + +``` +I0623 08:40:27.217701 107040 tensorrt_subgraph_pass.cc:258] RUN Paddle TRT int8 calibration mode... +I0623 08:40:27.217834 107040 tensorrt_subgraph_pass.cc:321] Prepare TRT engine (Optimize model structure, Select OP kernel etc). This process may cost a lot of time. 
+``` + +### 三、使用TRT 加载PaddleSlim Int8量化模型预测 + +这里,我们使用前面下载的ResNet50 PaddleSlim量化模型。与加载离线量化校准表执行Int8预测的区别是,PaddleSlim量化模型已经将scale保存在模型op的属性中,这里我们就不再需要校准表了,所以在运行样例时将`use_calib`配置为false。 + +1)**修改`run_impl.sh`** + +打开`run_impl.sh`,我们对以下的几处信息进行修改: + +```shell +# 选择使用Int8预测的demo +DEMO_NAME=trt_int8_test + +# 本节中,我们使用了TensorRT,需要将USE_TENSORRT打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=ON + +# 配置预测库的根目录 +LIB_DIR=/paddle/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。请注意CUDA和CUDNN需要设置到lib64一层,而TensorRT是设置到根目录一层 +CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.6_cuda10.1/lib64 +CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64 +TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5 +``` + +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例,注意此处要将use_calib配置为false +./trt_int8_test --model_file=../ResNet50_quant/model --params_file=../ResNet50_quant/params --use_calib=false +``` + +运行结束后,程序会将模型预测输出的前20个结果打印到屏幕,说明运行成功。 + +### 更多链接 +- [Paddle Inference使用Quick Start!](https://paddle-inference.readthedocs.io/en/latest/introduction/quick_start.html) +- [Paddle Inference C++ Api使用](https://paddle-inference.readthedocs.io/en/latest/user_guides/cxx_api.html) +- [Paddle Inference Python Api使用](https://paddle-inference.readthedocs.io/en/latest/user_guides/inference_python_api.html) diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/index.html b/doc/fluid/Paddle-Inference/c++/paddle-trt/index.html new file mode 100644 index 0000000000000000000000000000000000000000..92270bfd1df877c8efca925f9705718755f9cc8e --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/index.html @@ -0,0 +1,269 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/run_impl.sh b/doc/fluid/Paddle-Inference/c++/paddle-trt/run_impl.sh new file mode 100644 index 0000000000000000000000000000000000000000..13c4a18d22203e89819c3e579ce13bc492fbde52 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/run_impl.sh @@ -0,0 +1,27 @@ +mkdir -p build +cd build +rm -rf * + +# same with the resnet50_test.cc +DEMO_NAME=resnet50_trt_test + +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=ON + +LIB_DIR=/paddle/build/fluid_inference_install_dir +CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.6_cuda10.1/lib64 +CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64 +TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5 + +cmake .. -DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=${WITH_MKL} \ + -DDEMO_NAME=${DEMO_NAME} \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=${USE_TENSORRT} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DTENSORRT_ROOT=${TENSORRT_ROOT} + +make -j diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_fp32_test.cc b/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_fp32_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..30e824ebde23424af391db1b0cf2c8717499d78e --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_fp32_test.cc @@ -0,0 +1,78 @@ +#include +#include +#include +#include + +#include +#include + +#include "paddle/include/paddle_inference_api.h" + +using paddle::AnalysisConfig; + +DEFINE_string(model_file, "", "Path of the inference model file."); +DEFINE_string(params_file, "", "Path of the inference params file."); +DEFINE_string(model_dir, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size."); + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +std::unique_ptr CreatePredictor() { + AnalysisConfig config; + if (FLAGS_model_dir != "") { + config.SetModel(FLAGS_model_dir); + } else { + config.SetModel(FLAGS_model_file, + FLAGS_params_file); + } + config.EnableUseGpu(500, 0); + // We use ZeroCopy, so we set config.SwitchUseFeedFetchOps(false) here. 
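+  // Descriptive note on the TensorRT setup below: the arguments passed to
+  //  EnableTensorRtEngine are, in order, the TensorRT workspace size
+  //  (1 << 30 bytes here), the maximum batch size, the minimum subgraph size
+  //  handed over to TensorRT, the precision (kFloat32 for this FP32 demo),
+  //  use_static, and use_calib_mode (both disabled for plain FP32 inference).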
+ config.SwitchUseFeedFetchOps(false); + config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 5, AnalysisConfig::Precision::kFloat32, false, false); + return CreatePaddlePredictor(config); +} + +void run(paddle::PaddlePredictor *predictor, + const std::vector& input, + const std::vector& input_shape, + std::vector *out_data) { + int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape(input_shape); + input_t->copy_from_cpu(input.data()); + + CHECK(predictor->ZeroCopyRun()); + + auto output_names = predictor->GetOutputNames(); + // there is only one output of Resnet50 + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); +} + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + auto predictor = CreatePredictor(); + std::vector input_shape = {FLAGS_batch_size, 3, 224, 224}; + // Init input as 1.0 here for example. You can also load preprocessed real pictures to vectors as input. + std::vector input_data(FLAGS_batch_size * 3 * 224 * 224, 1.0); + std::vector out_data; + run(predictor.get(), input_data, input_shape, &out_data); + // Print first 20 outputs + for (int i = 0; i < 20; i++) { + LOG(INFO) << out_data[i] << std::endl; + } + return 0; +} diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_gen_calib_table_test.cc b/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_gen_calib_table_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..83aa69f752541c21a4494faf023dd5abc83793de --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_gen_calib_table_test.cc @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "paddle/include/paddle_inference_api.h" + +using paddle::AnalysisConfig; + +DEFINE_string(model_file, "", "Path of the inference model file."); +DEFINE_string(params_file, "", "Path of the inference params file."); +DEFINE_string(model_dir, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size."); + +float Random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +std::unique_ptr CreatePredictor() { + AnalysisConfig config; + if (FLAGS_model_dir != "") { + config.SetModel(FLAGS_model_dir); + } else { + config.SetModel(FLAGS_model_file, + FLAGS_params_file); + } + config.EnableUseGpu(500, 0); + // We use ZeroCopy, so we set config.SwitchUseFeedFetchOps(false) here. 
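+  // Precision::kInt8 combined with use_calib_mode=true (the last argument of
+  //  EnableTensorRtEngine below) puts Paddle-TRT into calibration mode: since no
+  //  calibration table exists yet under the model's _opt_cache directory, the
+  //  repeated warm-up runs in run() are used to generate one (see this demo's README).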
+ config.SwitchUseFeedFetchOps(false); + config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 5, AnalysisConfig::Precision::kInt8, false, true /*use_calib*/); + return CreatePaddlePredictor(config); +} + +void run(paddle::PaddlePredictor *predictor, + std::vector& input, + const std::vector& input_shape, + std::vector *out_data) { + int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + for (size_t i = 0; i < 500; i++) { + // We use random data here for example. Change this to real data in your application. + for (int j = 0; j < input_num; j++) { + input[j] = Random(0, 1.0); + } + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape(input_shape); + input_t->copy_from_cpu(input.data()); + + // Run predictor to generate calibration table. Can be very time-consuming. + CHECK(predictor->ZeroCopyRun()); + } +} + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + auto predictor = CreatePredictor(); + std::vector input_shape = {FLAGS_batch_size, 3, 224, 224}; + // Init input as 1.0 here for example. You can also load preprocessed real pictures to vectors as input. + std::vector input_data(FLAGS_batch_size * 3 * 224 * 224, 1.0); + std::vector out_data; + run(predictor.get(), input_data, input_shape, &out_data); + return 0; +} diff --git a/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_int8_test.cc b/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_int8_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..533d1cbe9fd04ee207c34adbcc1af16f294bf4cf --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/paddle-trt/trt_int8_test.cc @@ -0,0 +1,79 @@ +#include +#include +#include +#include + +#include +#include + +#include "paddle/include/paddle_inference_api.h" + +using paddle::AnalysisConfig; + +DEFINE_string(model_file, "", "Path of the inference model file."); +DEFINE_string(params_file, "", "Path of the inference params file."); +DEFINE_string(model_dir, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size."); +DEFINE_bool(use_calib, true, "Whether to use calib. Set to true if you are using TRT calibration; Set to false if you are using PaddleSlim quant models."); + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +std::unique_ptr CreatePredictor() { + AnalysisConfig config; + if (FLAGS_model_dir != "") { + config.SetModel(FLAGS_model_dir); + } else { + config.SetModel(FLAGS_model_file, + FLAGS_params_file); + } + config.EnableUseGpu(500, 0); + // We use ZeroCopy, so we set config.SwitchUseFeedFetchOps(false) here. 
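+  // FLAGS_use_calib (the last argument of EnableTensorRtEngine below) selects
+  //  between the two Int8 paths described in the README: true loads the offline
+  //  calibration table produced by trt_gen_calib_table_test, while false expects
+  //  a PaddleSlim-quantized model whose scales are already stored in the op attributes.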
+ config.SwitchUseFeedFetchOps(false); + config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, 5, AnalysisConfig::Precision::kInt8, false, FLAGS_use_calib); + return CreatePaddlePredictor(config); +} + +void run(paddle::PaddlePredictor *predictor, + const std::vector& input, + const std::vector& input_shape, + std::vector *out_data) { + int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape(input_shape); + input_t->copy_from_cpu(input.data()); + + CHECK(predictor->ZeroCopyRun()); + + auto output_names = predictor->GetOutputNames(); + // there is only one output of Resnet50 + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); +} + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + auto predictor = CreatePredictor(); + std::vector input_shape = {FLAGS_batch_size, 3, 224, 224}; + // Init input as 1.0 here for example. You can also load preprocessed real pictures to vectors as input. + std::vector input_data(FLAGS_batch_size * 3 * 224 * 224, 1.0); + std::vector out_data; + run(predictor.get(), input_data, input_shape, &out_data); + // Print first 20 outputs + for (int i = 0; i < 20; i++) { + LOG(INFO) << out_data[i] << std::endl; + } + return 0; +} diff --git a/doc/fluid/Paddle-Inference/c++/resnet50/CMakeLists.txt b/doc/fluid/Paddle-Inference/c++/resnet50/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1c02c9696190b3701719dc594e0a1353bed0510 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/resnet50/CMakeLists.txt @@ -0,0 +1,95 @@ +project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) + + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g") +set(CMAKE_STATIC_LIBRARY_PREFIX "") +message("flags" ${CMAKE_CXX_FLAGS}) + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + + +include_directories("${PADDLE_LIB}") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_ROOT}/include") + link_directories("${TENSORRT_ROOT}/lib") +endif() + +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") + +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) + +if(WITH_MKL) + include_directories("${PADDLE_LIB}/third_party/install/mklml/include") + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif() +else() + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +set(EXTERNAL_LIB "-lrt -ldl -lpthread") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash + ${EXTERNAL_LIB}) + +if(WITH_GPU) + if (USE_TENSORRT) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/libcublas${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX} ) +endif() + +target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/doc/fluid/Paddle-Inference/c++/resnet50/README.md b/doc/fluid/Paddle-Inference/c++/resnet50/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..89ee2623df3fbc133564c0dca210704e6ae656b6 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/resnet50/README.md @@ -0,0 +1,50 @@ +## 运行C++ ResNet50图像分类样例 + +### 一:获取Resnet50模型 + +点击[链接](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50.tar.gz)下载模型, 该模型在imagenet 数据集训练得到的,如果你想获取更多的**模型训练信息**,请访问[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification)。 + +### 二:**样例编译** + +文件`resnet50_test.cc` 为预测的样例程序(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +文件`CMakeLists.txt` 为编译构建文件。 +脚本`run_impl.sh` 包含了第三方库、预编译库的信息配置。 + +编译Resnet50样例,我们首先需要对脚本`run_impl.sh` 文件中的配置进行修改。 + +1)**修改`run_impl.sh`** + +打开`run_impl.sh`,我们对以下的几处信息进行修改: + +```shell +# 根据预编译库中的version.txt信息判断是否将以下三个标记打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=OFF + +# 配置预测库的根目录 +LIB_DIR=${YOUR_LIB_DIR}/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。 +CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.5_cuda10.1/lib64 +CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64 +# TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5 +``` + +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例 +./resnet50_test --model_file=${RESNET_MODEL_PATH}/ResNet/model --params_file=${RESNET_MODEL_PATH}/ResNet/params +``` + +运行结束后,程序会将模型结果打印到屏幕,说明运行成功。 + +### 更多链接 +- [Paddle Inference使用Quick Start!]() +- [Paddle Inference Python Api使用]() diff --git a/doc/fluid/Paddle-Inference/c++/resnet50/index.html b/doc/fluid/Paddle-Inference/c++/resnet50/index.html new file mode 100644 index 0000000000000000000000000000000000000000..f51c4a47b218e903456dea39243b8f922773018f --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/resnet50/index.html @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + +
+
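The resnet50 README above notes that the demo feeds a fixed-value input and must be modified if you want to read real data (with OpenCV or otherwise). Below is a minimal sketch of one such modification for the `resnet50_test.cc` that follows; it assumes the image has already been preprocessed offline into a raw float32 file in NCHW layout. The file name `input_3x224x224.raw` and the helper `ReadRawInput` are illustrative only and are not part of the demo.

```c++
#include <fstream>
#include <string>
#include <vector>

// Sketch: load a preprocessed float32 NCHW tensor from a raw binary file into
// the vector that resnet50_test.cc later passes to run(). Error handling is
// kept minimal on purpose.
std::vector<float> ReadRawInput(const std::string& path, size_t expected_size) {
  std::vector<float> data(expected_size, 0.f);
  std::ifstream fin(path, std::ios::binary);
  if (fin) {
    fin.read(reinterpret_cast<char*>(data.data()),
             expected_size * sizeof(float));
  }
  return data;
}

// In main(), the fixed-value initialization could then be replaced with:
//   std::vector<float> input_data =
//       ReadRawInput("input_3x224x224.raw", FLAGS_batch_size * 3 * 224 * 224);
```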
diff --git a/doc/fluid/Paddle-Inference/c++/resnet50/resnet50_test.cc b/doc/fluid/Paddle-Inference/c++/resnet50/resnet50_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0418ba12aeaf12f696fadebdd2bbbf383152a3b
--- /dev/null
+++ b/doc/fluid/Paddle-Inference/c++/resnet50/resnet50_test.cc
@@ -0,0 +1,80 @@
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <numeric>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "paddle/include/paddle_inference_api.h"
+
+using paddle::AnalysisConfig;
+
+DEFINE_string(model_file, "", "Path of the inference model file.");
+DEFINE_string(params_file, "", "Path of the inference params file.");
+DEFINE_string(model_dir, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size.");
+
+using Time = decltype(std::chrono::high_resolution_clock::now());
+Time time() { return std::chrono::high_resolution_clock::now(); };
+double time_diff(Time t1, Time t2) {
+  typedef std::chrono::microseconds ms;
+  auto diff = t2 - t1;
+  ms counter = std::chrono::duration_cast<ms>(diff);
+  return counter.count() / 1000.0;
+}
+
+std::unique_ptr<paddle::PaddlePredictor> CreatePredictor() {
+  AnalysisConfig config;
+  if (FLAGS_model_dir != "") {
+    config.SetModel(FLAGS_model_dir);
+  } else {
+    config.SetModel(FLAGS_model_file,
+                    FLAGS_params_file);
+  }
+  config.EnableUseGpu(100, 0);
+  // We use ZeroCopy, so we set config.SwitchUseFeedFetchOps(false)
+  config.SwitchUseFeedFetchOps(false);
+  config.EnableMKLDNN();
+
+  // Open the memory optim.
+  config.EnableMemoryOptim();
+  return CreatePaddlePredictor(config);
+}
+
+void run(paddle::PaddlePredictor *predictor,
+         const std::vector<float>& input,
+         const std::vector<int>& input_shape,
+         std::vector<float> *out_data) {
+  int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies<int>());
+
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape(input_shape);
+  input_t->copy_from_cpu(input.data());
+
+  CHECK(predictor->ZeroCopyRun());
+
+  auto output_names = predictor->GetOutputNames();
+  // there is only one output of Resnet50
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
+
+  out_data->resize(out_num);
+  output_t->copy_to_cpu(out_data->data());
+}
+
+int main(int argc, char* argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  auto predictor = CreatePredictor();
+  std::vector<int> input_shape = {FLAGS_batch_size, 3, 224, 224};
+  // init 0 for the input.
+  std::vector<float> input_data(FLAGS_batch_size * 3 * 224 * 224, 0);
+  std::vector<float> out_data;
+  run(predictor.get(), input_data, input_shape, &out_data);
+
+  for (auto e : out_data) {
+    LOG(INFO) << e << std::endl;
+  }
+  return 0;
+}
diff --git a/doc/fluid/Paddle-Inference/c++/resnet50/run_impl.sh b/doc/fluid/Paddle-Inference/c++/resnet50/run_impl.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4dd6b21191eab39af9cdef53bbfba14fd782da42
--- /dev/null
+++ b/doc/fluid/Paddle-Inference/c++/resnet50/run_impl.sh
@@ -0,0 +1,27 @@
+mkdir -p build
+cd build
+rm -rf *
+
+# same with the resnet50_test.cc
+DEMO_NAME=resnet50_test
+
+WITH_MKL=ON
+WITH_GPU=ON
+USE_TENSORRT=OFF
+
+LIB_DIR=/paddle/trt_refine_int8/build/fluid_inference_install_dir
+CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.5_cuda10.1/lib64
+CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64
+# TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5
+
+cmake .. 
-DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=${WITH_MKL} \ + -DDEMO_NAME=${DEMO_NAME} \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=${USE_TENSORRT} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DTENSORRT_ROOT=${TENSORRT_ROOT} + +make -j diff --git a/doc/fluid/Paddle-Inference/c++/yolov3/CMakeLists.txt b/doc/fluid/Paddle-Inference/c++/yolov3/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2206d484e5c271afc8bc00d06aaaee05b88c51fc --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/yolov3/CMakeLists.txt @@ -0,0 +1,96 @@ +cmake_minimum_required(VERSION 3.0) +project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) + + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_STATIC_LIBRARY_PREFIX "") +message("flags" ${CMAKE_CXX_FLAGS}) + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + + +include_directories("${PADDLE_LIB}") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_ROOT}/include") + link_directories("${TENSORRT_ROOT}/lib") +endif() + +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") + +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) + +if(WITH_MKL) + include_directories("${PADDLE_LIB}/third_party/install/mklml/include") + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif() +else() + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS + 
${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +set(EXTERNAL_LIB "-lrt -ldl -lpthread") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash + ${EXTERNAL_LIB}) + +if(WITH_GPU) + if (USE_TENSORRT) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_ROOT}/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/libcublas${CMAKE_SHARED_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX} ) +endif() + +target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/doc/fluid/Paddle-Inference/c++/yolov3/README.md b/doc/fluid/Paddle-Inference/c++/yolov3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d2a2ab0333799728807532cd35f235ca90359b39 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/yolov3/README.md @@ -0,0 +1,50 @@ +## 运行C++ YOLOv3图像检测样例 + +### 一:获取YOLOv3模型 + +点击[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz)下载模型, 该模型在imagenet数据集训练得到的,如果你想获取更多的**模型训练信息**,请访问[这里](https://github.com/PaddlePaddle/PaddleDetection)。 + +### 二:**样例编译** + +文件`yolov3_test.cc` 为预测的样例程序(程序中的输入为固定值,如果您有opencv或其他方式进行数据读取的需求,需要对程序进行一定的修改)。 +文件`CMakeLists.txt` 为编译构建文件。 +脚本`run_impl.sh` 包含了第三方库、预编译库的信息配置。 + +编译yolov3样例,我们首先需要对脚本`run_impl.sh` 文件中的配置进行修改。 + +1)**修改`run_impl.sh`** + +打开`run_impl.sh`,我们对以下的几处信息进行修改: + +```shell +# 根据预编译库中的version.txt信息判断是否将以下三个标记打开 +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=OFF + +# 配置预测库的根目录 +LIB_DIR=${YOUR_LIB_DIR}/fluid_inference_install_dir + +# 如果上述的WITH_GPU 或 USE_TENSORRT设为ON,请设置对应的CUDA, CUDNN, TENSORRT的路径。 +CUDNN_LIB=/usr/local/cudnn/lib64 +CUDA_LIB=/usr/local/cuda/lib64 +# TENSORRT_ROOT=/usr/local/TensorRT-6.0.1.5 +``` + +运行 `sh run_impl.sh`, 会在目录下产生build目录。 + + +2) **运行样例** + +```shell +# 进入build目录 +cd build +# 运行样例 +./yolov3_test -model_file ${YOLO_MODEL_PATH}/__model__ --params_file ${YOLO_MODEL_PATH}/__params__ +``` + +运行结束后,程序会将模型输出个数打印到屏幕,说明运行成功。 + +### 更多链接 +- [Paddle Inference使用Quick Start!]() +- [Paddle Inference Python Api使用]() diff --git a/doc/fluid/Paddle-Inference/c++/yolov3/index.html b/doc/fluid/Paddle-Inference/c++/yolov3/index.html new file mode 100644 index 0000000000000000000000000000000000000000..945e90ebddc3e80707ebac317662f43cc1a4f645 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/yolov3/index.html @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/c++/yolov3/run_impl.sh b/doc/fluid/Paddle-Inference/c++/yolov3/run_impl.sh new file mode 100644 index 0000000000000000000000000000000000000000..34180be98427011b406f41146dcbf93425182f88 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/yolov3/run_impl.sh @@ -0,0 +1,29 @@ +work_path=$(dirname $(readlink -f $0)) + +mkdir -p build +cd build +rm -rf * + +# same with the yolov3_test.cc +DEMO_NAME=yolov3_test + +WITH_MKL=ON +WITH_GPU=ON +USE_TENSORRT=OFF + +LIB_DIR=${work_path}/fluid_inference_install_dir +CUDNN_LIB=/usr/local/cudnn/lib64 +CUDA_LIB=/usr/local/cuda/lib64 +# TENSORRT_ROOT=/usr/local/TensorRT-6.0.1.5 + +cmake .. -DPADDLE_LIB=${LIB_DIR} \ + -DWITH_MKL=${WITH_MKL} \ + -DDEMO_NAME=${DEMO_NAME} \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=${USE_TENSORRT} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DTENSORRT_ROOT=${TENSORRT_ROOT} + +make -j diff --git a/doc/fluid/Paddle-Inference/c++/yolov3/yolov3_test.cc b/doc/fluid/Paddle-Inference/c++/yolov3/yolov3_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7df5ade5d13b1270fda36066105a3b674bdc6060 --- /dev/null +++ b/doc/fluid/Paddle-Inference/c++/yolov3/yolov3_test.cc @@ -0,0 +1,99 @@ +#include "paddle/include/paddle_inference_api.h" + +#include +#include +#include +#include + +#include +#include + +using paddle::AnalysisConfig; + +DEFINE_string(model_file, "", "Directory of the inference model."); +DEFINE_string(params_file, "", "Directory of the inference model."); +DEFINE_string(model_dir, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Directory of the inference model."); +DEFINE_bool(use_gpu, false, "enable gpu"); +DEFINE_bool(use_mkldnn, false, "enable mkldnn"); +DEFINE_bool(mem_optim, false, "enable memory optimize"); + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +std::unique_ptr CreatePredictor() { + AnalysisConfig config; + if (FLAGS_model_dir != "") { + config.SetModel(FLAGS_model_dir); + } else { + config.SetModel(FLAGS_model_file, + FLAGS_params_file); + } + if (FLAGS_use_gpu) { + config.EnableUseGpu(100, 0); + } + if (FLAGS_use_mkldnn) { + config.EnableMKLDNN(); + } + // Open the memory optim. 
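+  // EnableMemoryOptim() reuses the memory of tensors that do not depend on
+  // each other, lowering peak memory/GPU memory usage during inference.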
+  if (FLAGS_mem_optim) {
+    config.EnableMemoryOptim();
+  }
+  // We use ZeroCopy, so we set config->SwitchUseFeedFetchOps(false)
+  config.SwitchUseFeedFetchOps(false);
+  return CreatePaddlePredictor(config);
+}
+
+void run(paddle::PaddlePredictor *predictor,
+         const std::vector<float>& input,
+         const std::vector<int>& input_shape,
+         const std::vector<int>& input_im,
+         const std::vector<int>& input_im_shape,
+         std::vector<float> *out_data) {
+  auto input_names = predictor->GetInputNames();
+  auto input_img = predictor->GetInputTensor(input_names[0]);
+  input_img->Reshape(input_shape);
+  input_img->copy_from_cpu(input.data());
+
+  auto input_size = predictor->GetInputTensor(input_names[1]);
+  input_size->Reshape(input_im_shape);
+  input_size->copy_from_cpu(input_im.data());
+
+  CHECK(predictor->ZeroCopyRun());
+
+  auto output_names = predictor->GetOutputNames();
+  // there is only one output of yolov3
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
+
+  out_data->resize(out_num);
+  output_t->copy_to_cpu(out_data->data());
+}
+
+int main(int argc, char* argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  auto predictor = CreatePredictor();
+
+  const int height = 608;
+  const int width = 608;
+  const int channels = 3;
+  std::vector<int> input_shape = {FLAGS_batch_size, channels, height, width};
+  std::vector<float> input_data(FLAGS_batch_size * channels * height * width, 0);
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    input_data[i] = i % 255 * 0.13f;
+  }
+  std::vector<int> input_im_shape = {FLAGS_batch_size, 2};
+  std::vector<int> input_im_data(FLAGS_batch_size * 2, 608);
+
+  std::vector<float> out_data;
+  run(predictor.get(), input_data, input_shape, input_im_data, input_im_shape, &out_data);
+  LOG(INFO) << "output num is " << out_data.size();
+  return 0;
+}
diff --git a/doc/fluid/Paddle-Inference/docs/Makefile b/doc/fluid/Paddle-Inference/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..298ea9e213e8c4c11f0431077510d4e325733c65
--- /dev/null
+++ b/doc/fluid/Paddle-Inference/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
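+# For example, "make html" renders the documentation into $(BUILDDIR)/html.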
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/fluid/Paddle-Inference/docs/README.md b/doc/fluid/Paddle-Inference/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/doc/fluid/Paddle-Inference/docs/api_reference/cxx_api_doc.md b/doc/fluid/Paddle-Inference/docs/api_reference/cxx_api_doc.md new file mode 100644 index 0000000000000000000000000000000000000000..83494168ff46eff4a33f3fd70d5e333d31d90ff9 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/api_reference/cxx_api_doc.md @@ -0,0 +1 @@ +# C++ API 文档 diff --git a/doc/fluid/Paddle-Inference/docs/benchmark/benchmark.rst b/doc/fluid/Paddle-Inference/docs/benchmark/benchmark.rst new file mode 100644 index 0000000000000000000000000000000000000000..aad6f6bd662b12ba3df910b4621175e76eeab694 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/benchmark/benchmark.rst @@ -0,0 +1,30 @@ +性能数据 +========= + +GPU性能 +-------------- + +**测试条件** + +- 测试模型 + - Mobilenetv1 + - Resnet50 + - Yolov3 + - Unet + - Bert/Ernie +- 测试机器 + - P4 + - T4 +- 测试说明 + - 测试Paddle版本:release/1.8 + - warmup=10, repeats=1000,统计平均时间,单位ms。 + +**性能数据** + +X86 CPU性能 +------------- + +**测试条件** + +**性能数据** + diff --git a/doc/fluid/Paddle-Inference/docs/conf.py b/doc/fluid/Paddle-Inference/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..e65f90ccaa7d532094b1b6e680022c411ddbdf3b --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/conf.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +#sys.path.insert(0, os.path.abspath('.')) + +import sphinx_rtd_theme +from recommonmark.parser import CommonMarkParser +from recommonmark.transform import AutoStructify + +# -- Project information ----------------------------------------------------- + +project = u'Paddle-Inference' +copyright = u'2020, Paddle-Inference Developer' +author = u'Paddle-Inference Developer' + +# The short X.Y version +version = u'latest' +# The full version, including alpha/beta/rc tags +release = u'' + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['recommonmark', 'sphinx_markdown_tables'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. 
+# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Paddle-Inference doc' + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Paddle-Inference.tex', u'Paddle-Inference Documentation', + u'Paddle-Inference Developer', 'manual'), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, 'paddle-inference', + u'Paddle-Inference Documentation', [author], 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Paddle-Inference', u'Paddle-Inference Documentation', author, + 'Paddle-Inference', 'One line description of project.', 'Miscellaneous'), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. 
+epub_exclude_files = ['search.html'] + +extensions = [ + # there may be others here already, e.g. 'sphinx.ext.mathjax' + 'breathe', + 'exhale' +] + +# Setup the breathe extension +breathe_projects = {"My Project": "./doxyoutput/xml"} +breathe_default_project = "My Project" + +# Setup the exhale extension +exhale_args = { + # These arguments are required + "containmentFolder": "./api", + "rootFileName": "library_root.rst", + "rootFileTitle": "Library API", + "doxygenStripFromPath": "..", + # Suggested optional arguments + "createTreeView": True, + # TIP: if using the sphinx-bootstrap-theme, you need + # "treeViewIsBootstrap": True, + "exhaleExecutesDoxygen": True, + "exhaleDoxygenStdin": "INPUT = paddle_include_file" +} + +# Tell sphinx what the primary language being documented is. +primary_domain = 'cpp' + +# Tell sphinx what the pygments highlight language should be. +highlight_language = 'cpp' + +import os + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +if not on_rtd: # only import and set the theme if we're building docs locally + import sphinx_rtd_theme + html_theme = 'sphinx_rtd_theme' + #html_theme = "alabaster" + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/doc/fluid/Paddle-Inference/docs/images/architecture.png b/doc/fluid/Paddle-Inference/docs/images/architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..35cb336a0640c868d6fc1df738f039a0e7b5884d Binary files /dev/null and b/doc/fluid/Paddle-Inference/docs/images/architecture.png differ diff --git a/doc/fluid/Paddle-Inference/docs/index.html b/doc/fluid/Paddle-Inference/docs/index.html new file mode 100644 index 0000000000000000000000000000000000000000..efbfd8a4e368928d761789c0a475c863ab586e34 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/index.html @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/docs/index.rst b/doc/fluid/Paddle-Inference/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..aef5605a99ed012f87857e5a3938b1fbeea92e68 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/index.rst @@ -0,0 +1,66 @@ +.. Paddle-Inference documentation master file, created by + sphinx-quickstart on Thu Feb 6 14:11:30 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +################ +预测部署 +################ + + +Welcome to Paddle-Inference's documentation! +======================================= + +.. toctree:: + :maxdepth: 1 + :caption: 开始使用 + :name: sec-introduction + + introduction/summary + introduction/quick_start + +.. toctree:: + :maxdepth: 1 + :caption: 使用方法 + :name: sec-user-guides + + user_guides/tutorial + user_guides/source_compile + user_guides/inference_python_api + user_guides/cxx_api + +.. toctree:: + :maxdepth: 1 + :caption: 性能调优 + :name: sec-optimize + + optimize/paddle_trt + +.. toctree:: + :maxdepth: 1 + :caption: 工具 + :name: sec-tools + + tools/visual + tools/x2paddle + +.. toctree:: + :maxdepth: 1 + :caption: Benchmark + :name: sec-benchmark + + benchmark/benchmark + +.. toctree:: + :maxdepth: 2 + :caption: API文档 + + api/library_root + +.. toctree:: + :maxdepth: 1 + :caption: FAQ + + introduction/faq + + diff --git a/doc/fluid/Paddle-Inference/docs/introduction/faq.rst b/doc/fluid/Paddle-Inference/docs/introduction/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..0c66722c62a6a24a5bd1ae7a7d709316a201cf64 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/introduction/faq.rst @@ -0,0 +1,2 @@ +FAQ 常见问题 +============ diff --git a/doc/fluid/Paddle-Inference/docs/introduction/quick_start.rst b/doc/fluid/Paddle-Inference/docs/introduction/quick_start.rst new file mode 100644 index 0000000000000000000000000000000000000000..c00ba9cea6292d5ed14f4da8aacc47df1b17154a --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/introduction/quick_start.rst @@ -0,0 +1,176 @@ +Quick Start +================= + +**前提准备** +接下来我们会通过几段Python代码的方式对Paddle Inference使用进行介绍, +为了能够成功运行代码,请您在环境中(Mac, Windows,Linux)安装不低于1.7版本的Paddle, +安装Paddle 请参考 `飞桨官网主页 `_。 + +导出预测模型文件 +---------------- + +在模型训练期间,我们通常使用Python来构建模型结构,比如: + +.. code:: python + + import paddle.fluid as fluid + res = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) + +在模型部署时,我们需要提前将这种Python表示的结构以及参数序列化到磁盘中。那是如何做到的呢? + +在模型训练过程中或者模型训练结束后,我们可以通过save_inference_model接口来导出标准化的模型文件。 + +我们用一个简单的代码例子来展示下导出模型文件的这一过程。 + + +.. 
code:: python + + import paddle + import paddle.fluid as fluid + # 建立一个简单的网络,网络的输入的shape为[batch, 3, 28, 28] + image_shape = [3, 28, 28] + + img = fluid.layers.data(name='image', shape=image_shape, dtype='float32', append_batch_size=True) + # 模型包含两个Conv层 + conv1 = fluid.layers.conv2d( + input=img, + num_filters=8, + filter_size=3, + stride=2, + padding=1, + groups=1, + act=None, + bias_attr=True) + + out = fluid.layers.conv2d( + input=conv1, + num_filters=8, + filter_size=3, + stride=2, + padding=1, + groups=1, + act=None, + bias_attr=True) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + # 创建网络中的参数变量,并初始化参数变量 + exe.run(fluid.default_startup_program()) + + # 如果存在预训练模型 + # def if_exist(var): + # return os.path.exists(os.path.join("./ShuffleNet", var.name)) + # fluid.io.load_vars(exe, "./pretrained_model", predicate=if_exist) + # 保存模型到model目录中,只保存与输入image和输出与推理相关的部分网络 + fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe, model_filename='model', params_filename='params') + +该程序运行结束后,会在本目录中生成一个sample_model目录,目录中包含model, params 两个文件,model文件表示模型的结构文件,params表示所有参数的融合文件。 + + +飞桨提供了 **两种标准** 的模型文件,一种为Combined方式, 一种为No-Combined的方式。 + +- Combined的方式 + +.. code:: python + + fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe, model_filename='model', params_filename='params') + +model_filename,params_filename表示要生成的模型结构文件、融合参数文件的名字。 + + +* No-Combined的方式 + +.. code:: python + + fluid.io.save_inference_model(dirname='./sample_model', feeded_var_names=['image'], target_vars = [out], executor=exe) + +如果不指定model_filename,params_filename,会在sample_model目录下生成__model__ 模型结构文件,以及一系列的参数文件。 + + +在模型部署期间,**我们更推荐使用Combined的方式**,因为涉及模型上线加密的场景时,这种方式会更友好一些。 + + + +加载模型预测 +---------------- + +1)使用load_inference方式 + +我们可以使用load_inference_model接口加载训练好的模型(以sample_model模型举例),并复用训练框架的前向计算,直接完成推理。 +示例程序如下所示: + +.. code:: python + + import paddle.fluid as fluid + import numpy as np + + data = np.ones((1, 3, 28, 28)).astype(np.float32) + exe = fluid.Executor(fluid.CPUPlace()) + + # 加载Combined的模型需要指定model_filename, params_filename + # 加载No-Combined的模型不需要指定model_filename, params_filename + [inference_program, feed_target_names, fetch_targets] = \ + fluid.io.load_inference_model(dirname='sample_model', executor=exe, model_filename='model', params_filename='params') + + with fluid.program_guard(inference_program): + results = exe.run(inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets, return_numpy=False) + + print (np.array(results[0]).shape) + # (1, 8, 7, 7) + +在上述方式中,在模型加载后会按照执行顺序将所有的OP进行拓扑排序,在运行期间Op会按照排序一一运行,整个过程中运行的为训练中前向的OP,期间不会有任何的优化(OP融合,显存优化,预测Kernel针对优化)。 因此,load_inference_model的方式预测期间很可能不会有很好的性能表现,此方式比较适合用来做实验(测试模型的效果、正确性等)使用,并不适用于真正的部署上线。接下来我们会重点介绍Paddle Inference的使用。 + +2)使用Paddle Inference API方式 + +不同于 load_inference_model方式,Paddle Inference 在模型加载后会进行一系列的优化,包括: Kernel优化,OP横向,纵向融合,显存/内存优化,以及MKLDNN,TensorRT的集成等,性能和吞吐会得到大幅度的提升。这些优化会在之后的文档中进行详细的介绍。 + +那我们先用一个简单的代码例子来介绍Paddle Inference 的使用。 + +.. 
code:: + + from paddle.fluid.core import AnalysisConfig + from paddle.fluid.core import create_paddle_predictor + + import numpy as np + + # 配置运行信息 + # config = AnalysisConfig("./sample_model") # 加载non-combined 模型格式 + config = AnalysisConfig("./sample_model/model", "./sample_model/params") # 加载combine的模型格式 + + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + config.enable_use_gpu(1000, 0) + + # 根据config创建predictor + predictor = create_paddle_predictor(config) + + img = np.ones((1, 3, 28, 28)).astype(np.float32) + + # 准备输入 + input_names = predictor.get_input_names() + input_tensor = predictor.get_input_tensor(input_names[0]) + input_tensor.reshape(img.shape) + input_tensor.copy_from_cpu(img.copy()) + + # 运行 + predictor.zero_copy_run() + + # 获取输出 + output_names = predictor.get_output_names() + output_tensor = predictor.get_output_tensor(output_names[0]) + output_data = output_tensor.copy_to_cpu() + + print (output_data) + +上述的代码例子,我们通过加载一个简答模型以及随机输入的方式,展示了如何使用Paddle Inference进行模型预测。可能对于刚接触Paddle Inferenece同学来说,代码中会有一些陌生名词出现,比如AnalysisConfig, Predictor 等。先不要着急,接下来的文章中会对这些概念进行详细的介绍。 + + +**相关链接** + +`Python API 使用介绍 <../user_guides/inference_python_api.html>`_ +`C++ API使用介绍 <../user_guides/cxx_api.html>`_ +`Python 使用样例 `_ +`C++ 使用样例 `_ + diff --git a/doc/fluid/Paddle-Inference/docs/introduction/summary.rst b/doc/fluid/Paddle-Inference/docs/introduction/summary.rst new file mode 100644 index 0000000000000000000000000000000000000000..16ead666d12c2c2b74b5bdfd68b6acd6c3a6a39c --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/introduction/summary.rst @@ -0,0 +1,44 @@ +概述 +======== + +Paddle Inference为飞桨核心框架推理引擎。Paddle Inference功能特性丰富,性能优异,针对不同平台不同的应用场景进行了深度的适配优化,做到高吞吐、低时延,保证了飞桨模型在服务器端即训即用,快速部署。 + +特性 +------- + +- 通用性。支持对Paddle训练出的所有模型进行预测。 + +- 内存/显存复用。在推理初始化阶段,对模型中的OP输出Tensor 进行依赖分析,将两两互不依赖的Tensor在内存/显存空间上进行复用,进而增大计算并行量,提升服务吞吐量。 + + +- 细粒度OP融合。在推理初始化阶段,按照已有的融合模式将模型中的多个OP融合成一个OP,减少了模型的计算量的同时,也减少了 Kernel Launch的次数,从而能提升推理性能。目前Paddle Inference支持的融合模式多达几十个。 + + +- 高性能CPU/GPU Kernel。内置同Intel、Nvidia共同打造的高性能kernel,保证了模型推理高性能的执行。 + + +- 子图集成 `TensorRT `_。Paddle Inference采用子图的形式集成TensorRT,针对GPU推理场景,TensorRT可对一些子图进行优化,包括OP的横向和纵向融合,过滤冗余的OP,并为OP自动选择最优的kernel,加快推理速度。 + + +- 集成MKLDNN + +- 支持加载PaddleSlim量化压缩后的模型。 `PaddleSlim `_ 是飞桨深度学习模型压缩工具,Paddle Inference可联动PaddleSlim,支持加载量化、裁剪和蒸馏后的模型并部署,由此减小模型存储空间、减少计算占用内存、加快模型推理速度。其中在模型量化方面,`Paddle Inference在X86 CPU上做了深度优化 `_ ,常见分类模型的单线程性能可提升近3倍,ERNIE模型的单线程性能可提升2.68倍。 + +支持系统及硬件 +------------ + +支持服务器端X86 CPU、NVIDIA GPU芯片,兼容Linux/macOS/Windows系统。 + +同时也支持NVIDIA Jetson嵌入式平台。 + +语言支持 +------------ + +- 支持Pyhton语言 +- 支持C++ 语言 +- 支持Go语言 +- 支持R语言 + +**下一步** + +- 如果您刚接触Paddle Inference, 请访问 `Quick start <./quick_start.html>`_。 diff --git a/doc/fluid/Paddle-Inference/docs/make.bat b/doc/fluid/Paddle-Inference/docs/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..7893348a1b7dbb588983a48e6991282eae7e1b55 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/fluid/Paddle-Inference/docs/optimize/paddle_trt.rst b/doc/fluid/Paddle-Inference/docs/optimize/paddle_trt.rst new file mode 100644 index 0000000000000000000000000000000000000000..79e8ccc14602e66da053b1c1b7b65c4b1304285d --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/optimize/paddle_trt.rst @@ -0,0 +1,334 @@ +使用Paddle-TensorRT库预测 +================ + +NVIDIA TensorRT 是一个高性能的深度学习预测库,可为深度学习推理应用程序提供低延迟和高吞吐量。PaddlePaddle 采用子图的形式对TensorRT进行了集成,即我们可以使用该模块来提升Paddle模型的预测性能。在这篇文章中,我们会介绍如何使用Paddle-TRT子图加速预测。 + +概述 +---------------- + +当模型加载后,神经网络可以表示为由变量和运算节点组成的计算图。如果我们打开TRT子图模式,在图分析阶段,Paddle会对模型图进行分析同时发现图中可以使用TensorRT优化的子图,并使用TensorRT节点替换它们。在模型的推断期间,如果遇到TensorRT节点,Paddle会调用TensorRT库对该节点进行优化,其他的节点调用Paddle的原生实现。TensorRT除了有常见的OP融合以及显存/内存优化外,还针对性的对OP进行了优化加速实现,降低预测延迟,提升推理吞吐。 + +目前Paddle-TRT支持静态shape模式以及/动态shape模式。在静态shape模式下支持图像分类,分割,检测模型,同时也支持Fp16, Int8的预测加速。在动态shape模式下,除了对动态shape的图像模型(FCN, Faster rcnn)支持外,同时也对NLP的Bert/Ernie模型也进行了支持。 + +**Paddle-TRT的现有能力:** + +**1)静态shape:** + +支持模型: + +=========== ============= ======== + 分类模型 检测模型 分割模型 +=========== ============= ======== +Mobilenetv1 yolov3 ICNET +Resnet50 SSD UNet +Vgg16 Mask-rcnn FCN +Resnext Faster-rcnn +AlexNet Cascade-rcnn +Se-ResNext Retinanet +GoogLeNet Mobilenet-SSD +DPN +=========== ============= ======== + +.. |check| raw:: html + + + +.. |check_| raw:: html + + + +.. |uncheck| raw:: html + + + +.. |uncheck_| raw:: html + + + +Fp16: |check| + +Calib Int8: |check| + +优化信息序列化: |check| + +加载PaddleSlim Int8模型: |check| + + +**2)动态shape:** + +支持模型: + +=========== ===== + 图像 NLP +=========== ===== +FCN Bert +Faster_RCNN Ernie +=========== ===== + +Fp16: |check| + +Calib Int8: |uncheck| + +优化信息序列化: |uncheck| + +加载PaddleSlim Int8模型: |uncheck| + + +**Note:** + +1. 从源码编译时,TensorRT预测库目前仅支持使用GPU编译,且需要设置编译选项TENSORRT_ROOT为TensorRT所在的路径。 +2. Windows支持需要TensorRT 版本5.0以上。 +3. 使用Paddle-TRT的动态shape输入功能要求TRT的版本在6.0以上。 + + +一:环境准备 +------------- + +使用Paddle-TRT功能,我们需要准备带TRT的Paddle运行环境,我们提供了以下几种方式: + +1)linux下通过pip安装 + +.. code:: shell + + # 该whl包依赖cuda10.1, cudnnv7.6, tensorrt6.0 的lib, 需自行下载安装并设置lib路径到LD_LIBRARY_PATH中 + wget https://paddle-inference-dist.bj.bcebos.com/libs/paddlepaddle_gpu-1.8.0-cp27-cp27mu-linux_x86_64.whl + pip install -U paddlepaddle_gpu-1.8.0-cp27-cp27mu-linux_x86_64.whl + + +如果您想在Nvidia Jetson平台上使用,请点击此 `链接 `_ 下载whl包,然后通过pip 安装。 + +2)使用docker镜像 + +.. 
code:: shell + + # 拉取镜像,该镜像预装Paddle 1.8 Python环境,并包含c++的预编译库,lib存放在主目录~/ 下。 + docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" + + docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash + +3)手动编译 +编译的方式请参照 `编译文档 <../user_guides/source_compile.html>`_ + +**Note1:** cmake 期间请设置 TENSORRT_ROOT (即TRT lib的路径), WITH_PYTHON (是否产出python whl包, 设置为ON)选项。 + +**Note2:** 编译期间会出现TensorRT相关的错误。 + +需要手动在 NvInfer.h (trt5) 或 NvInferRuntime.h (trt6) 文件中为 class IPluginFactory 和 class IGpuAllocator 分别添加虚析构函数: + +.. code:: c++ + + virtual ~IPluginFactory() {}; + virtual ~IGpuAllocator() {}; + +需要将 `NvInferRuntime.h` (trt6)中的 **protected: ~IOptimizationProfile() noexcept = default;** + +改为 + +.. code:: c++ + + virtual ~IOptimizationProfile() noexcept = default; + + + +二:API使用介绍 +----------------- + +在 `使用流程 <../user_guides/tutorial.html>`_ 一节中,我们了解到Paddle Inference预测包含了以下几个方面: + +- 配置推理选项 +- 创建predictor +- 准备模型输入 +- 模型推理 +- 获取模型输出 + +使用Paddle-TRT 也是遵照这样的流程。我们先用一个简单的例子来介绍这一流程(我们假设您已经对Paddle Inference有一定的了解,如果您刚接触Paddle Inference,请访问 `这里 <../introduction/quick_start>`_ 对Paddle Inference有个初步认识。): + +.. code:: python + + import numpy as np + from paddle.fluid.core import AnalysisConfig + from paddle.fluid.core import create_paddle_predictor + + def create_predictor(): + # config = AnalysisConfig("") + config = AnalysisConfig("./resnet50/model", "./resnet50/params") + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + config.enable_use_gpu(1000, 0) + + # 打开TensorRT。此接口的详细介绍请见下文 + config.enable_tensorrt_engine(workspace_size = 1<<30, + max_batch_size=1, min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Float32, + use_static=False, use_calib_mode=False) + + predictor = create_paddle_predictor(config) + return predictor + + def run(predictor, img): + # 准备输入 + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(img[i].shape) + input_tensor.copy_from_cpu(img[i].copy()) + # 预测 + predictor.zero_copy_run() + results = [] + # 获取输出 + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + return results + + if __name__ == '__main__': + pred = create_predictor() + img = np.ones((1, 3, 224, 224)).astype(np.float32) + result = run(pred, [img]) + print ("class index: ", np.argmax(result[0][0])) + + +通过例子我们可以看出,我们通过 `enable_tensorrt_engine` 接口来打开TensorRT选项的。 + +.. 
code:: python + + config.enable_tensorrt_engine( + workspace_size = 1<<30, + max_batch_size=1, min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Float32, + use_static=False, use_calib_mode=False) + + +接下来让我们看下该接口中各个参数的作用: + +- **workspace_size**,类型:int,默认值为1 << 30 (1G)。指定TensorRT使用的工作空间大小,TensorRT会在该大小限制下筛选最优的kernel执行预测运算。 +- **max_batch_size**,类型:int,默认值为1。需要提前设置最大的batch大小,运行时batch大小不得超过此限定值。 +- **min_subgraph_size**,类型:int,默认值为3。Paddle-TRT是以子图的形式运行,为了避免性能损失,当子图内部节点个数大于 min_subgraph_size 的时候,才会使用Paddle-TRT运行。 +- **precision_mode**,类型:**AnalysisConfig.Precision**, 默认值为 **AnalysisConfig.Precision.Float32**。指定使用TRT的精度,支持FP32(Float32),FP16(Half),Int8(Int8)。若需要使用Paddle-TRT int8离线量化校准,需设定precision为 **AnalysisConfig.Precision.Int8** , 且设置 **use_calib_mode** 为True。 +- **use_static**,类型:bool, 默认值为False。如果指定为True,在初次运行程序的时候会将TRT的优化信息进行序列化到磁盘上,下次运行时直接加载优化的序列化信息而不需要重新生成。 +- **use_calib_mode**,类型:bool, 默认值为False。若要运行Paddle-TRT int8离线量化校准,需要将此选项设置为True。 + +Int8量化预测 +>>>>>>>>>>>>>> + +神经网络的参数在一定程度上是冗余的,在很多任务上,我们可以在保证模型精度的前提下,将Float32的模型转换成Int8的模型,从而达到减小计算量降低运算耗时、降低计算内存、降低模型大小的目的。使用Int8量化预测的流程可以分为两步:1)产出量化模型;2)加载量化模型进行Int8预测。下面我们对使用Paddle-TRT进行Int8量化预测的完整流程进行详细介绍。 + +**1. 产出量化模型** + +目前,我们支持通过两种方式产出量化模型: + +a. 使用TensorRT自带Int8离线量化校准功能。校准即基于训练好的FP32模型和少量校准数据(如500~1000张图片)生成校准表(Calibration table),预测时,加载FP32模型和此校准表即可使用Int8精度预测。生成校准表的方法如下: + + - 指定TensorRT配置时,将 **precision_mode** 设置为 **AnalysisConfig.Precision.Int8** 并且设置 **use_calib_mode** 为 **True**。 + + .. code:: python + + config.enable_tensorrt_engine( + workspace_size=1<<30, + max_batch_size=1, min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Int8, + use_static=False, use_calib_mode=True) + + - 准备500张左右的真实输入数据,在上述配置下,运行模型。(Paddle-TRT会统计模型中每个tensor值的范围信息,并将其记录到校准表中,运行结束后,会将校准表写入模型目录下的 `_opt_cache` 目录中) + + 如果想要了解使用TensorRT自带Int8离线量化校准功能生成校准表的完整代码,请参考 `这里 `_ 的demo。 + +b. 使用模型压缩工具库PaddleSlim产出量化模型。PaddleSlim支持离线量化和在线量化功能,其中,离线量化与TensorRT离线量化校准原理相似;在线量化又称量化训练(Quantization Aware Training, QAT),是基于较多数据(如>=5000张图片)对预训练模型进行重新训练,使用模拟量化的思想,在训练阶段更新权重,实现减小量化误差的方法。使用PaddleSlim产出量化模型可以参考文档: + + - 离线量化 `快速开始教程 `_ + - 离线量化 `API接口说明 `_ + - 离线量化 `Demo `_ + - 量化训练 `快速开始教程 `_ + - 量化训练 `API接口说明 `_ + - 量化训练 `Demo `_ + +离线量化的优点是无需重新训练,简单易用,但量化后精度可能受影响;量化训练的优点是模型精度受量化影响较小,但需要重新训练模型,使用门槛稍高。在实际使用中,我们推荐先使用TRT离线量化校准功能生成量化模型,若精度不能满足需求,再使用PaddleSlim产出量化模型。 + +**2. 加载量化模型进行Int8预测** + + 加载量化模型进行Int8预测,需要在指定TensorRT配置时,将 **precision_mode** 设置为 **AnalysisConfig.Precision.Int8** 。 + + 若使用的量化模型为TRT离线量化校准产出的,需要将 **use_calib_mode** 设为 **True** : + + .. code:: python + + config.enable_tensorrt_engine( + workspace_size=1<<30, + max_batch_size=1, min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Int8, + use_static=False, use_calib_mode=True) + + 完整demo请参考 `这里 `_ 。 + + 若使用的量化模型为PaddleSlim量化产出的,需要将 **use_calib_mode** 设为 **False** : + + .. code:: python + + config.enable_tensorrt_engine( + workspace_size=1<<30, + max_batch_size=1, min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Int8, + use_static=False, use_calib_mode=False) + + 完整demo请参考 `这里 `_ 。 + +运行Dynamic shape +>>>>>>>>>>>>>> + +从1.8 版本开始, Paddle对TRT子图进行了Dynamic shape的支持。 +使用接口如下: + +.. 
code:: python + + config.enable_tensorrt_engine( + workspace_size = 1<<30, + max_batch_size=1, min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Float32, + use_static=False, use_calib_mode=False) + + min_input_shape = {"image":[1,3, 10, 10]} + max_input_shape = {"image":[1,3, 224, 224]} + opt_input_shape = {"image":[1,3, 100, 100]} + + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) + + + +从上述使用方式来看,在 config.enable_tensorrt_engine 接口的基础上,新加了一个config.set_trt_dynamic_shape_info 的接口。 + +该接口用来设置模型输入的最小,最大,以及最优的输入shape。 其中,最优的shape处于最小最大shape之间,在预测初始化期间,会根据opt shape对op选择最优的kernel。 + +调用了 **config.set_trt_dynamic_shape_info** 接口,预测器会运行TRT子图的动态输入模式,运行期间可以接受最小,最大shape间的任意的shape的输入数据。 + + + +三:测试样例 +------------- + +我们在github上提供了使用TRT子图预测的更多样例: + +- Python 样例请访问此处 `链接 `_ 。 +- C++ 样例地址请访问此处 `链接 `_ 。 + +四:Paddle-TRT子图运行原理 +--------------- + + PaddlePaddle采用子图的形式对TensorRT进行集成,当模型加载后,神经网络可以表示为由变量和运算节点组成的计算图。Paddle TensorRT实现的功能是对整个图进行扫描,发现图中可以使用TensorRT优化的子图,并使用TensorRT节点替换它们。在模型的推断期间,如果遇到TensorRT节点,Paddle会调用TensorRT库对该节点进行优化,其他的节点调用Paddle的原生实现。TensorRT在推断期间能够进行Op的横向和纵向融合,过滤掉冗余的Op,并对特定平台下的特定的Op选择合适的kernel等进行优化,能够加快模型的预测速度。 + +下图使用一个简单的模型展示了这个过程: + +**原始网络** + + .. image:: https://raw.githubusercontent.com/NHZlX/FluidDoc/add_trt_doc/doc/fluid/user_guides/howto/inference/image/model_graph_original.png + +**转换的网络** + + .. image:: https://raw.githubusercontent.com/NHZlX/FluidDoc/add_trt_doc/doc/fluid/user_guides/howto/inference/image/model_graph_trt.png + + 我们可以在原始模型网络中看到,绿色节点表示可以被TensorRT支持的节点,红色节点表示网络中的变量,黄色表示Paddle只能被Paddle原生实现执行的节点。那些在原始网络中的绿色节点被提取出来汇集成子图,并由一个TensorRT节点代替,成为转换后网络中的 **block-25** 节点。在网络运行过程中,如果遇到该节点,Paddle将调用TensorRT库来对其执行。 diff --git a/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_analysis_config.h b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_analysis_config.h new file mode 100644 index 0000000000000000000000000000000000000000..2002d1f76abfeb8c35fcad51c8c1bcc16db78336 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_analysis_config.h @@ -0,0 +1,579 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file paddle_analysis_config.h +/// +/// \brief Paddle Analysis Config API信息 +/// +/// \author paddle-infer@baidu.com +/// \date 2020-03-20 +/// \since 1.7 +/// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +/*! \file */ + +// Here we include some header files with relative paths, for that in deploy, +// the abstract path of this header file will be changed. +#include "paddle_api.h" // NOLINT +#include "paddle_pass_builder.h" // NOLINT +#ifdef PADDLE_WITH_MKLDNN +#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#endif + +namespace paddle { + +class AnalysisPredictor; +struct MkldnnQuantizerConfig; + +/// +/// \brief configuration manager for AnalysisPredictor. +/// \since 1.7.0 +/// +/// AnalysisConfig manages configurations of AnalysisPredictor. 
+/// During inference procedure, there are many parameters(model/params path, +/// place of inference, etc.) +/// to be specified, and various optimizations(subgraph fusion, memory +/// optimazation, TensorRT engine, etc.) +/// to be done. Users can manage these settings by creating and modifying an +/// AnalysisConfig, +/// and loading it into AnalysisPredictor. +/// +struct AnalysisConfig { + AnalysisConfig() = default; + /// + /// \brief Construct a new AnalysisConfig from another + /// AnalysisConfig. + /// + /// \param[in] other another AnalysisConfig + /// + explicit AnalysisConfig(const AnalysisConfig& other); + /// + /// \brief Construct a new AnalysisConfig from a no-combined model. + /// + /// \param[in] model_dir model directory of the no-combined model. + /// + explicit AnalysisConfig(const std::string& model_dir); + /// + /// \brief Construct a new AnalysisConfig from a combined model. + /// + /// \param[in] prog_file model file path of the combined model. + /// \param[in] params_file params file path of the combined model. + /// + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); + /// + /// \brief Precision of inference in TensorRT. + /// + enum class Precision { + kFloat32 = 0, ///< fp32 + kInt8, ///< int8 + kHalf, ///< fp16 + }; + + /// + /// \brief Set the no-combined model dir path. + /// + /// \param model_dir model dir path. + /// + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + + /// + /// \brief Set the combined model with two specific pathes for program and + /// parameters. + /// + /// \param prog_file_path model file path of the combined model. + /// \param params_file_path params file path of the combined model. + /// + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + /// + /// \brief Set the model file path of a combined model. + /// + /// \param x model file path. + /// + void SetProgFile(const std::string& x) { prog_file_ = x; } + /// + /// \brief Set the params file path of a combined model. + /// + /// \param x params file path. + /// + void SetParamsFile(const std::string& x) { params_file_ = x; } + + /// + /// \brief Set the path of optimization cache directory. + /// + /// \param opt_cache_dir the path of optimization cache directory. + /// + void SetOptimCacheDir(const std::string& opt_cache_dir) { + opt_cache_dir_ = opt_cache_dir; + } + /// + /// \brief Get the model directory path. + /// + /// \return const std::string& The model directory path. + /// + const std::string& model_dir() const { return model_dir_; } + /// + /// \brief Get the program file path. + /// + /// \return const std::string& The program file path. + /// + const std::string& prog_file() const { return prog_file_; } + /// + /// \brief Get the combined parameters file. + /// + /// \return const std::string& The combined parameters file. + /// + const std::string& params_file() const { return params_file_; } + + // Padding related. + + /// + /// \brief Turn off FC Padding. + /// + /// + void DisableFCPadding(); + /// + /// \brief A boolean state telling whether fc padding is used. + /// + /// \return bool Whether fc padding is used. + /// + bool use_fc_padding() const { return use_fc_padding_; } + + // GPU related. + + /// + /// \brief Turn on GPU. + /// + /// \param memory_pool_init_size_mb initial size of the GPU memory pool in MB. + /// \param device_id device_id the GPU card to use (default is 0). 
+ /// + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + /// + /// \brief Turn off GPU. + /// + /// + void DisableGpu(); + /// + /// \brief A boolean state telling whether the GPU is turned on. + /// + /// \return bool Whether the GPU is turned on. + /// + bool use_gpu() const { return use_gpu_; } + /// + /// \brief Get the GPU device id. + /// + /// \return int The GPU device id. + /// + int gpu_device_id() const { return device_id_; } + /// + /// \brief Get the initial size in MB of the GPU memory pool. + /// + /// \return int The initial size in MB of the GPU memory pool. + /// + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + /// + /// \brief Get the proportion of the initial memory pool size compared to the + /// device. + /// + /// \return float The proportion of the initial memory pool size. + /// + float fraction_of_gpu_memory_for_pool() const; + + // CUDNN related. + /// + /// \brief Turn on CUDNN. + /// + /// + void EnableCUDNN(); + /// + /// \brief A boolean state telling whether to use CUDNN. + /// + /// \return bool Whether to use CUDNN. + /// + bool cudnn_enabled() const { return use_cudnn_; } + + /// + /// \brief Control whether to perform IR graph optimization. + /// If turned off, the AnalysisConfig will act just like a NativeConfig. + /// + /// \param x Whether the ir graph optimization is actived. + /// + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + /// + /// \brief A boolean state telling whether the ir graph optimization is + /// actived. + /// + /// \return bool Whether to use ir graph optimization. + /// + bool ir_optim() const { return enable_ir_optim_; } + + /// + /// \brief INTERNAL Determine whether to use the feed and fetch operators. + /// Just for internal development, not stable yet. + /// When ZeroCopyTensor is used, this should be turned off. + /// + /// \param x Whether to use the feed and fetch operators. + /// + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + /// + /// \brief A boolean state telling whether to use the feed and fetch + /// operators. + /// + /// \return bool Whether to use the feed and fetch operators. + /// + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } + + /// + /// \brief Control whether to specify the inputs' names. + /// The ZeroCopyTensor type has a name member, assign it with the + /// corresponding + /// variable name. This is used only when the input ZeroCopyTensors passed to + /// the + /// AnalysisPredictor.ZeroCopyRun() cannot follow the order in the training + /// phase. + /// + /// \param x Whether to specify the inputs' names. + /// + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + /// + /// \brief A boolean state tell whether the input ZeroCopyTensor names + /// specified should + /// be used to reorder the inputs in AnalysisPredictor.ZeroCopyRun(). + /// + /// \return bool Whether to specify the inputs' names. + /// + bool specify_input_name() const { return specify_input_name_; } + + /// + /// \brief Turn on the TensorRT engine. + /// The TensorRT engine will accelerate some subgraphes in the original Fluid + /// computation graph. In some models such as resnet50, GoogleNet and so on, + /// it gains significant performance acceleration. + /// + /// \param workspace_size The memory size(in byte) used for TensorRT + /// workspace. + /// \param max_batch_size The maximum batch size of this prediction task, + /// better set as small as possible for less performance loss. 
+ /// \param min_subgrpah_size The minimum TensorRT subgraph size needed, if a + /// subgraph is smaller than this, it will not be transferred to TensorRT + /// engine. + /// \param precision The precision used in TensorRT. + /// \param use_static Serialize optimization information to disk for reusing. + /// \param use_calib_mode Use TRT int8 calibration(post training + /// quantization). + /// + /// + void EnableTensorRtEngine(int workspace_size = 1 << 20, + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32, + bool use_static = false, + bool use_calib_mode = true); + /// + /// \brief A boolean state telling whether the TensorRT engine is used. + /// + /// \return bool Whether the TensorRT engine is used. + /// + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /// + /// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. + /// \param min_input_shape The min input shape of the subgraph input. + /// \param max_input_shape The max input shape of the subgraph input. + /// \param opt_input_shape The opt input shape of the subgraph input. + /// \param disable_trt_plugin_fp16 Setting this parameter to true means that + /// TRT plugin will not run fp16. + /// + void SetTRTDynamicShapeInfo( + std::map> min_input_shape, + std::map> max_input_shape, + std::map> optim_input_shape, + bool disable_trt_plugin_fp16 = false); + /// + /// \brief Turn on the usage of Lite sub-graph engine. + /// + /// \param precision_mode Precion used in Lite sub-graph engine. + /// \param passes_filter Set the passes used in Lite sub-graph engine. + /// \param ops_filter Operators not supported by Lite. + /// + void EnableLiteEngine( + AnalysisConfig::Precision precision_mode = Precision::kFloat32, + const std::vector& passes_filter = {}, + const std::vector& ops_filter = {}); + + /// + /// \brief A boolean state indicating whether the Lite sub-graph engine is + /// used. + /// + /// \return bool whether the Lite sub-graph engine is used. + /// + bool lite_engine_enabled() const { return use_lite_; } + + /// + /// \brief Control whether to debug IR graph analysis phase. + /// This will generate DOT files for visualizing the computation graph after + /// each analysis pass applied. + /// + /// \param x whether to debug IR graph analysis phase. + /// + void SwitchIrDebug(int x = true); + + /// + /// \brief Turn on MKLDNN. + /// + /// + void EnableMKLDNN(); + /// + /// \brief Set the cache capacity of different input shapes for MKLDNN. + /// Default value 0 means not caching any shape. + /// + /// \param capacity The cache capacity. + /// + void SetMkldnnCacheCapacity(int capacity); + /// + /// \brief A boolean state telling whether to use the MKLDNN. + /// + /// \return bool Whether to use the MKLDNN. + /// + bool mkldnn_enabled() const { return use_mkldnn_; } + + /// + /// \brief Set the number of cpu math library threads. + /// + /// \param cpu_math_library_num_threads The number of cpu math library + /// threads. + /// + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + /// + /// \brief An int state telling how many threads are used in the CPU math + /// library. + /// + /// \return int The number of threads used in the CPU math library. + /// + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + /// + /// \brief Transform the AnalysisConfig to NativeConfig. + /// + /// \return NativeConfig The NativeConfig transformed. 
+ /// + NativeConfig ToNativeConfig() const; + /// + /// \brief Specify the operator type list to use MKLDNN acceleration. + /// + /// \param op_list The operator type list. + /// + void SetMKLDNNOp(std::unordered_set op_list) { + mkldnn_enabled_op_types_ = op_list; + } + + /// + /// \brief Turn on MKLDNN quantization. + /// + /// + void EnableMkldnnQuantizer(); + + /// + /// \brief A boolean state telling whether the MKLDNN quantization is enabled. + /// + /// \return bool Whether the MKLDNN quantization is enabled. + /// + bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } + + /// + /// \brief Get MKLDNN quantizer config. + /// + /// \return MkldnnQuantizerConfig* MKLDNN quantizer config. + /// + MkldnnQuantizerConfig* mkldnn_quantizer_config() const; + + /// + /// \brief Specify the memory buffer of program and parameter. + /// Used when model and params are loaded directly from memory. + /// + /// \param prog_buffer The memory buffer of program. + /// \param prog_buffer_size The size of the model data. + /// \param params_buffer The memory buffer of the combined parameters file. + /// \param params_buffer_size The size of the combined parameters data. + /// + void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, size_t params_buffer_size); + /// + /// \brief A boolean state telling whether the model is set from the CPU + /// memory. + /// + /// \return bool Whether model and params are loaded directly from memory. + /// + bool model_from_memory() const { return model_from_memory_; } + + /// + /// \brief Turn on memory optimize + /// NOTE still in development. + /// + void EnableMemoryOptim(); + /// + /// \brief A boolean state telling whether the memory optimization is + /// activated. + /// + /// \return bool Whether the memory optimization is activated. + /// + bool enable_memory_optim() const; + + /// + /// \brief Turn on profiling report. + /// If not turned on, no profiling report will be generated. + /// + void EnableProfile(); + /// + /// \brief A boolean state telling whether the profiler is activated. + /// + /// \return bool Whether the profiler is activated. + /// + bool profile_enabled() const { return with_profile_; } + + /// + /// \brief Mute all logs in Paddle inference. + /// + void DisableGlogInfo(); + /// + /// \brief A boolean state telling whether logs in Paddle inference are muted. + /// + /// \return bool Whether logs in Paddle inference are muted. + /// + bool glog_info_disabled() const { return !with_glog_info_; } + + /// + /// \brief Set the AnalysisConfig to be invalid. + /// This is to ensure that an AnalysisConfig can only be used in one + /// AnalysisPredictor. + /// + void SetInValid() const { is_valid_ = false; } + /// + /// \brief A boolean state telling whether the AnalysisConfig is valid. + /// + /// \return bool Whether the AnalysisConfig is valid. + /// + bool is_valid() const { return is_valid_; } + + friend class ::paddle::AnalysisPredictor; + + /// + /// \brief Get a pass builder for customize the passes in IR analysis phase. + /// NOTE: Just for developer, not an official API, easy to be broken. + /// + /// + PassStrategy* pass_builder() const; + void PartiallyRelease(); + + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + + protected: + // Model pathes. + std::string model_dir_; + mutable std::string prog_file_; + mutable std::string params_file_; + + // GPU related. 
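+  // These defaults are overridden by EnableUseGpu(memory_pool_init_size_mb,
+  // device_id) declared above.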
+ bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + bool use_cudnn_{false}; + + // Padding related + bool use_fc_padding_{true}; + + // TensorRT related. + bool use_tensorrt_{false}; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting + int tensorrt_workspace_size_{1 << 30}; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. + int tensorrt_max_batchsize_{1}; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_{Precision::kFloat32}; + bool trt_use_static_engine_{false}; + bool trt_use_calib_mode_{true}; + std::map> min_input_shape_{}; + std::map> max_input_shape_{}; + std::map> optim_input_shape_{}; + bool disable_trt_plugin_fp16_{false}; + + // memory reuse related. + bool enable_memory_optim_{false}; + + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + + bool model_from_memory_{false}; + + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + bool with_profile_{false}; + + bool with_glog_info_{true}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; + + bool use_lite_{false}; + std::vector lite_passes_filter_; + std::vector lite_ops_filter_; + Precision lite_precision_mode_; + + // mkldnn related. + int mkldnn_cache_capacity_{0}; + bool use_mkldnn_quantizer_{false}; + std::shared_ptr mkldnn_quantizer_config_; + + // If the config is already used on a predictor, it becomes invalid. + // Any config can only be used with one predictor. + // Variables held by config can take up a lot of memory in some cases. + // So we release the memory when the predictor is set up. + mutable bool is_valid_{true}; + std::string opt_cache_dir_; +}; + +} // namespace paddle diff --git a/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_api.h b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_api.h new file mode 100644 index 0000000000000000000000000000000000000000..240ec08b72240713120dc48a7fa6fe630599b058 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_api.h @@ -0,0 +1,407 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +/*! \file paddle_api.h + */ + +/*! 
\mainpage Paddle Inference APIs + * \section intro_sec Introduction + * The Paddle inference library aims to offer an high performance inference SDK + * for Paddle users. + */ + +#include +#include +#include +#include +#include + +/*! \namespace paddle + */ +namespace paddle { + +/// \brief Paddle data type. +enum PaddleDType { + FLOAT32, + INT64, + INT32, + UINT8, + // TODO(Superjomn) support more data types if needed. +}; + +/// \brief Memory manager for PaddleTensor. +/// +/// The PaddleBuf holds a buffer for data input or output. The memory can be +/// allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf +/// should be reused for better performance. +/// +/// For user allocated memory, the following API can be used: +/// - PaddleBuf(void* data, size_t length) to set an external memory by +/// specifying the memory address and length. +/// - Reset(void* data, size_t length) to reset the PaddleBuf with an external +/// memory. +/// ATTENTION, for user allocated memory, deallocation should be done by users +/// externally after the program finished. The PaddleBuf won't do any allocation +/// or deallocation. +/// +/// To have the PaddleBuf allocate and manage the memory: +/// - PaddleBuf(size_t length) will allocate a memory of size `length`. +/// - Resize(size_t length) resize the memory to no less than `length`, +/// ATTENTION +/// if the allocated memory is larger than `length`, nothing will done. +/// +/// Usage: +/// +/// Let PaddleBuf manage the memory internally. +/// \code{cpp} +/// const int num_elements = 128; +/// PaddleBuf buf(num_elements/// sizeof(float)); +/// \endcode +/// +/// Or +/// \code{cpp} +/// PaddleBuf buf; +/// buf.Resize(num_elements/// sizeof(float)); +/// \endcode +/// Works the exactly the same. +/// +/// One can also make the `PaddleBuf` use the external memory. +/// \code{cpp} +/// PaddleBuf buf; +/// void* external_memory = new float[num_elements]; +/// buf.Reset(external_memory, num_elements*sizeof(float)); +/// ... +/// delete[] external_memory; // manage the memory lifetime outside. +/// \endcode +/// +class PaddleBuf { + public: + /// + /// \brief PaddleBuf allocate memory internally, and manage it. + /// + /// \param[in] length The length of data. + /// + explicit PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + /// + /// \brief Set external memory, the PaddleBuf won't manage it. + /// + /// \param[in] data The start address of the external memory. + /// \param[in] length The length of data. + /// + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + /// + /// \brief Copy only available when memory is managed externally. + /// + /// \param[in] other another `PaddleBuf` + /// + explicit PaddleBuf(const PaddleBuf& other); + /// + /// \brief Resize the memory. + /// + /// \param[in] length The length of data. + /// + void Resize(size_t length); + /// + /// \brief Reset to external memory, with address and length set. + /// + /// \param[in] data The start address of the external memory. + /// \param[in] length The length of data. + /// + void Reset(void* data, size_t length); + /// + /// \brief Tell whether the buffer is empty. + /// + bool empty() const { return length_ == 0; } + /// + /// \brief Get the data's memory address. + /// + void* data() const { return data_; } + /// + /// \brief Get the memory length. 
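+  /// \return size_t The number of bytes held by the buffer.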
+ /// + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + + private: + void Free(); + void* data_{nullptr}; ///< pointer to the data memory. + size_t length_{0}; ///< number of memory bytes. + bool memory_owned_{true}; +}; + +/// +/// \brief Basic input and output data structure for PaddlePredictor. +/// +struct PaddleTensor { + PaddleTensor() = default; + std::string name; ///< variable name. + std::vector shape; + PaddleBuf data; ///< blob of data. + PaddleDType dtype; + std::vector> lod; ///< Tensor+LoD equals LoDTensor +}; + +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; + +/// \brief Represents an n-dimensional array of values. +/// The ZeroCopyTensor is used to store the input or output of the network. +/// Zero copy means that the tensor supports direct copy of host or device data +/// to device, +/// eliminating additional CPU copy. ZeroCopyTensor is only used in the +/// AnalysisPredictor. +/// It is obtained through PaddlePredictor::GetinputTensor() +/// and PaddlePredictor::GetOutputTensor() interface. +class ZeroCopyTensor { + public: + /// \brief Reset the shape of the tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling mutable_data() or copy_from_cpu() + /// \param shape The shape to set. + void Reshape(const std::vector& shape); + + /// \brief Get the memory pointer in CPU or GPU with specific data type. + /// Please Reshape the tensor first before call this. + /// It's usually used to get input data pointer. + /// \param place The place of the tensor. + template + T* mutable_data(PaddlePlace place); + + /// \brief Get the memory pointer directly. + /// It's usually used to get the output data pointer. + /// \param[out] place To get the device type of the tensor. + /// \param[out] size To get the data size of the tensor. + /// \return The tensor data buffer pointer. + template + T* data(PaddlePlace* place, int* size) const; + + /// \brief Copy the host memory to tensor data. + /// It's usually used to set the input tensor data. + /// \param data The pointer of the data, from which the tensor will copy. + template + void copy_from_cpu(const T* data); + + /// \brief Copy the tensor data to the host memory. + /// It's usually used to get the output tensor data. + /// \param[out] data The tensor will copy the data to the address. + template + void copy_to_cpu(T* data); + + /// \brief Return the shape of the Tensor. + std::vector shape() const; + + /// \brief Set lod info of the tensor. + /// More about LOD can be seen here: + /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor + /// \param x the lod info. + void SetLoD(const std::vector>& x); + /// \brief Return the lod info of the tensor. + std::vector> lod() const; + /// \brief Return the name of the tensor. + const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } + + /// \brief Return the data type of the tensor. + /// It's usually used to get the output tensor data type. + /// \return The data type of the tensor. 
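+  ///
+  /// For example (a hedged sketch; `out_t` is assumed to be a ZeroCopyTensor
+  /// obtained from GetOutputTensor(), not part of this API):
+  /// \code{cpp}
+  /// if (out_t->type() == PaddleDType::FLOAT32) {
+  ///   // interpret the output buffer as float data
+  /// }
+  /// \endcode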
+ PaddleDType type() const; + + protected: + explicit ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; + // The corresponding tensor pointer inside Paddle workspace is cached for + // performance. + mutable void* tensor_{nullptr}; + PaddlePlace place_; + PaddleDType dtype_; + int device_; +}; + +/// \brief A Predictor for executing inference on a model. +/// Base class for AnalysisPredictor and NativePaddlePredictor. +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + /// \brief This interface takes input and runs the network. + /// There are redundant copies of data between hosts in this operation, + /// so it is more recommended to use the zecopyrun interface + /// \param[in] inputs An list of PaddleTensor as the input to the network. + /// \param[out] output_data Pointer to the tensor list, which holds the output + /// paddletensor + /// \param[in] batch_size This setting has been discarded and can be ignored. + /// \return Whether the run is successful + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) = 0; + + /// \brief Used to get the name of the network input. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// \return Input tensor names. + virtual std::vector GetInputNames() { return {}; } + + /// \brief Get the input shape of the model. + /// \return A map contains all the input names and shape defined in the model. + virtual std::map> GetInputTensorShape() { + return {}; + } + + /// \brief Used to get the name of the network output. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// \return Output tensor names. + virtual std::vector GetOutputNames() { return {}; } + + /// \brief Get the input ZeroCopyTensor by name. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// The name is obtained from the GetInputNames() interface. + /// \param name The input tensor name. + /// \return Return the corresponding input ZeroCopyTensor. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + + /// \brief Get the output ZeroCopyTensor by name. + /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. + /// The name is obtained from the GetOutputNames() interface. + /// \param name The output tensor name. + /// \return Return the corresponding output ZeroCopyTensor. + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + /// \brief Run the network with zero-copied inputs and outputs. + /// Be inherited by AnalysisPredictor and only used in ZeroCopy scenarios. + /// This will save the IO copy for transfering inputs and outputs to predictor + /// workspace + /// and get some performance improvement. + /// To use it, one should call the AnalysisConfig.SwitchUseFeedFetchOp(true) + /// and then use the `GetInputTensor` and `GetOutputTensor` + /// to directly write or read the input/output tensors. 
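+  ///
+  /// A minimal sketch of that flow (tensor indices, the shape and the
+  /// `input_data`/`output_data` buffers below are illustrative placeholders):
+  /// \code{cpp}
+  /// auto in_names = predictor->GetInputNames();
+  /// auto in_t = predictor->GetInputTensor(in_names[0]);
+  /// in_t->Reshape({1, 3, 224, 224});
+  /// in_t->copy_from_cpu(input_data);    // write the input directly
+  /// predictor->ZeroCopyRun();           // run without feed/fetch ops
+  /// auto out_names = predictor->GetOutputNames();
+  /// auto out_t = predictor->GetOutputTensor(out_names[0]);
+  /// out_t->copy_to_cpu(output_data);    // read the output directly
+  /// \endcode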
+ /// \return Whether the run is successful + virtual bool ZeroCopyRun() { return false; } + + /// \brief Clone an existing predictor + /// When using clone, the same network will be created, + /// and the parameters between them are shared. + /// \return unique_ptr which contains the pointer of predictor + virtual std::unique_ptr Clone() = 0; + + /// \brief Destroy the Predictor. + virtual ~PaddlePredictor() = default; + + virtual std::string GetSerializedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + } + + /// \brief Base class for NativeConfig and AnalysisConfig. + struct Config { + std::string model_dir; /*!< path to the model directory. */ + }; +}; + +/// +/// \brief configuration manager for `NativePredictor`. +/// +/// `AnalysisConfig` manages configurations of `NativePredictor`. +/// During inference procedure, there are many parameters(model/params path, +/// place of inference, etc.) +/// +struct NativeConfig : public PaddlePredictor::Config { + /// GPU related fields. + bool use_gpu{false}; + int device{0}; + float fraction_of_gpu_memory{ + -1.f}; ///< Change to a float in (0,1] if needed. + + std::string prog_file; + std::string + param_file; ///< Specify the exact path of program and parameter files. + + bool specify_input_name{false}; ///< Specify the variable's name of each + ///< input if input tensors don't follow the + ///< `feeds` and `fetches` of the phase + ///< `save_inference_model`. + + /// Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + protected: + int cpu_math_library_num_threads_{1}; ///< number of cpu math library (such + ///< as MKL, OpenBlas) threads for each + ///< instance. +}; + +/// +/// \brief A factory to help create different predictors. +/// +/// Usage: +/// +/// \code{.cpp} +/// NativeConfig config; +/// ... // change the configs. +/// auto native_predictor = CreatePaddlePredictor(config); +/// \endcode +/// +/// FOR EXTENSION DEVELOPER: +/// Different predictors are designated by config type. Similar configs can be +/// merged, but there shouldn't be a huge config containing different fields for +/// more than one kind of predictors. +//// +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +/// NOTE The following APIs are too trivial, we will discard it in the following +/// versions. +/// +enum class PaddleEngineKind { + kNative = 0, ///< Use the native Fluid facility. + kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. + kAnalysis, ///< More optimization. +}; + +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + +std::string get_version(); + +} // namespace paddle diff --git a/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_inference_api.h b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_inference_api.h new file mode 100644 index 0000000000000000000000000000000000000000..6f30ad95f168cebe9702c90fbd2cca2c79a0e83f --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_inference_api.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we + * might release another API. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle_analysis_config.h" // NOLINT +#include "paddle_api.h" // NOLINT diff --git a/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_mkldnn_quantizer_config.h b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_mkldnn_quantizer_config.h new file mode 100644 index 0000000000000000000000000000000000000000..6ddbef78f9d4cbd0ddd282cbb7e82fd4fcb444e4 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_mkldnn_quantizer_config.h @@ -0,0 +1,198 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file paddle_mkldnn_quantizer_config.h +/// +/// \brief Mkldnn quantizer config. +/// +/// \author paddle-infer@baidu.com +/// \date 2020-01-01 +/// \since 1.7.0 +/// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +namespace paddle { + +/// +/// \brief Algorithms for finding scale of quantized Tensors. +/// +enum class ScaleAlgo { + NONE, ///< Do not compute scale + MAX, ///< Find scale based on the max absolute value + MAX_CH, ///< Find scale based on the max absolute value per output channel + MAX_CH_T, ///< Find scale based on the max absolute value per output channel + ///< of a transposed tensor + KL, ///< Find scale based on KL Divergence +}; + +/// +/// \class MkldnnQuantizerConfig +/// +/// \brief Config for mkldnn quantize. +/// +/// The MkldnnQuantizerConfig is used to configure Mkldnn's quantization +/// parameters, including scale algorithm, warmup data, warmup batch size, +/// quantized op list, etc. +/// +/// It is not recommended to use this config directly, please refer to +/// AnalysisConfig::mkldnn_quantizer_config() +/// +struct MkldnnQuantizerConfig { + /// + /// \brief Construct a new Mkldnn Quantizer Config object + /// + MkldnnQuantizerConfig(); + + /// + /// \brief Set the scale algo + /// + /// Specify a quantization algorithm for a connection (input/output) of the + /// operator type. + /// \param[in] op_type_name the operator's name. + /// \param[in] conn_name name of the connection (input/output) of the + /// operator. + /// \param[in] algo the algorithm for computing scale. 
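+  ///
+  /// For example (a sketch; `quantizer_config` and the op/connection names
+  /// below are illustrative only):
+  /// \code{cpp}
+  /// quantizer_config->SetScaleAlgo("conv2d", "Input", ScaleAlgo::KL);
+  /// \endcode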
+ /// + void SetScaleAlgo(std::string op_type_name, std::string conn_name, + ScaleAlgo algo) { + rules_[op_type_name][conn_name] = algo; + } + + /// + /// \brief Get the scale algo + /// + /// Get the quantization algorithm for a connection (input/output) of the + /// operator type. + /// + /// \param[in] op_type_name the operator's name. + /// \param[in] conn_name name of the connection (input/output) of the + /// operator. + /// \return the scale algo. + /// + ScaleAlgo scale_algo(const std::string& op_type_name, + const std::string& conn_name) const; + + /// + /// \brief Set the warmup data + /// + /// Set the batch of data to be used for warm-up iteration. + /// + /// \param[in] data batch of data. + /// + void SetWarmupData(std::shared_ptr> data) { + warmup_data_ = data; + } + + /// + /// \brief Get the warmup data + /// + /// Get the batch of data used for warm-up iteration. + /// + /// \return the warm up data + /// + std::shared_ptr> warmup_data() const { + return warmup_data_; + } + + /// + /// \brief Set the warmup batch size + /// + /// Set the batch size for warm-up iteration. + /// + /// \param[in] batch_size warm-up batch size + /// + void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; } + + /// + /// \brief Get the warmup batch size + /// + /// Get the batch size for warm-up iteration. + /// + /// \return the warm up batch size + int warmup_batch_size() const { return warmup_bs_; } + + /// + /// \brief Set quantized op list + /// + /// In the quantization process, set the op list that supports quantization + /// + /// \param[in] op_list List of quantized ops + /// + void SetEnabledOpTypes(std::unordered_set op_list) { + enabled_op_types_ = op_list; + } + + /// + /// \brief Get quantized op list + /// + /// \return list of quantized ops + /// + const std::unordered_set& enabled_op_types() const { + return enabled_op_types_; + } + + /// + /// \brief Set the excluded op ids + /// + /// \param[in] op_ids_list excluded op ids + /// + void SetExcludedOpIds(std::unordered_set op_ids_list) { + excluded_op_ids_ = op_ids_list; + } + + /// + /// \brief Get the excluded op ids + /// + /// \return exclude op ids + /// + const std::unordered_set& excluded_op_ids() const { + return excluded_op_ids_; + } + + /// + /// \brief Set default scale algorithm + /// + /// \param[in] algo Method for calculating scale in quantization process + /// + void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; } + + /// + /// \brief Get default scale algorithm + /// + /// \return Method for calculating scale in quantization + /// process + /// + ScaleAlgo default_scale_algo() const { return default_scale_algo_; } + + protected: + std::map> rules_; + std::unordered_set enabled_op_types_; + std::unordered_set excluded_op_ids_; + std::shared_ptr> warmup_data_; + int warmup_bs_{1}; + ScaleAlgo default_scale_algo_{ScaleAlgo::MAX}; +}; + +} // namespace paddle diff --git a/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_pass_builder.h b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_pass_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..bce463182d50969656b08aa0055dc18992b30688 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/paddle_include_file/paddle_pass_builder.h @@ -0,0 +1,221 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +/// +/// \file paddle_pass_builder.h +/// +/// \brief Class Paddle Passs Builder and its subclasses(pass strategies). +/// \section sec_intro Introduction +/// This class aims to build passes for paddle and define passes' strategies. +/// +/// \author paddle-infer@baidu.com +/// \date 2020-3-23 +/// \since 1.7 + +/// \namespace paddle +namespace paddle { + +/// \class PaddlePassBuilder +/// \brief This class build passes based on vector input. It is part of +/// inference API. Users can build passes, insert new passes, delete passes +/// using this class and its functions. +/// +/// Example Usage: +/// Build a new pass. +/// \code{cpp} +/// const vector passes(1, "conv_relu_mkldnn_fuse_pass"); +/// PaddlePassBuilder builder(passes); +/// \endcode +class PaddlePassBuilder { + public: + /// \brief Constructor of the class. It stores the input passes. + /// \param[in] passes passes' types. + explicit PaddlePassBuilder(const std::vector &passes) + : passes_(passes) {} + + /// \brief Stores the input passes. + /// \param[in] passes passes' types. + void SetPasses(std::initializer_list passes) { + passes_ = passes; + } + + /// \brief Append a pass to the end of the passes. + /// \param[in] pass_type the type of the new pass. + void AppendPass(const std::string &pass_type); + + /// \brief Insert a pass to a specific position. + /// \param[in] idx the position to insert. + /// \param[in] pass_type the type of insert pass. + void InsertPass(size_t idx, const std::string &pass_type); + + /// \brief Delete the pass at certain position 'idx'. + /// \param[in] idx the position to delete. + void DeletePass(size_t idx); + + /// \brief Delete all passes that has a certain type 'pass_type'. + /// \param[in] pass_type the certain pass type to be deleted. + void DeletePass(const std::string &pass_type); + + /// \brief Delete all the passes. + void ClearPasses(); + + /// \brief Append an analysis pass. + /// \param[in] pass the type of the new analysis pass. + void AppendAnalysisPass(const std::string &pass); + + /// \brief Visualize the computation graph after each pass by generating a DOT + /// language file, one can draw them with the Graphviz toolkit. + void TurnOnDebug(); + /// \brief Human-readable information of the passes. + std::string DebugString(); + + /// \brief Get information of passes. + /// \return Return list of the passes. + const std::vector &AllPasses() const { return passes_; } + + /// \brief Get information of analysis passes. + /// \return Return list of analysis passes. + std::vector AnalysisPasses() const { + auto passes = analysis_passes_; + // To make sure the ir_graph_to_program should be the last pass so any + // modication of IR will persist to the program. 
+ passes.push_back("ir_graph_to_program_pass"); + return passes; + } + + protected: + /// \cond Protected + std::vector analysis_passes_{ + {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", + "inference_op_replace_pass"}}; + std::vector passes_; + /// \endcond +}; + +/// \class PassStrategy +/// \brief This class defines the pass strategies like whether to use gpu/cuDNN +/// kernel/MKLDNN. +class PassStrategy : public PaddlePassBuilder { + public: + /// \brief Constructor of PassStrategy class. It works the same as + /// PaddlePassBuilder class. \param[in] passes passes' types. + explicit PassStrategy(const std::vector &passes) + : PaddlePassBuilder(passes) {} + + /// \brief Enable the use of cuDNN kernel. + virtual void EnableCUDNN() {} + + /// \brief Enable the use of MKLDNN. + /// The MKLDNN control exists in both CPU and GPU mode, because there can + /// still be some CPU kernels running in GPU mode. + virtual void EnableMKLDNN() {} + + /// \brief Enable MKLDNN quantize optimization. + virtual void EnableMkldnnQuantizer() {} + + /// \brief Check if we are using gpu. + /// \return A bool variable implying whether we are in gpu mode. + bool use_gpu() const { return use_gpu_; } + + /// \brief Default destructor. + virtual ~PassStrategy() = default; + + protected: + /// \cond Protected + bool use_gpu_{false}; + bool use_mkldnn_{false}; + /// \endcond +}; + +/// \class CpuPassStrategy +/// \brief The CPU passes controller, it is used in AnalysisPredictor with CPU +/// mode. +class CpuPassStrategy : public PassStrategy { + public: + /// \brief Default constructor of CpuPassStrategy. + CpuPassStrategy(); + + /// \brief Construct by copying another CpuPassStrategy object. + /// \param[in] other The CpuPassStrategy object we want to copy. + explicit CpuPassStrategy(const CpuPassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_gpu_ = other.use_gpu_; + use_mkldnn_ = other.use_mkldnn_; + use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; + } + /// \brief Default destructor. + virtual ~CpuPassStrategy() = default; + + /// \brief Enable the use of cuDNN kernel. + void EnableCUDNN() override; + + /// \brief Enable the use of MKLDNN. + void EnableMKLDNN() override; + + /// \brief Enable MKLDNN quantize optimization. + void EnableMkldnnQuantizer() override; + + protected: + /// \cond Protected + bool use_mkldnn_quantizer_{false}; + /// \endcond +}; + +/// \class GpuPassStrategy +/// \brief The GPU passes controller, it is used in AnalysisPredictor with GPU +/// mode. +class GpuPassStrategy : public PassStrategy { + public: + /// \brief Default constructor of GpuPassStrategy. + GpuPassStrategy(); + + /// \brief Construct by copying another GpuPassStrategy object. + /// \param[in] other The GpuPassStrategy object we want to copy. + explicit GpuPassStrategy(const GpuPassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + use_cudnn_ = other.use_cudnn_; + } + + /// \brief Enable the use of cuDNN kernel. + void EnableCUDNN() override; + + /// \brief Not supported in GPU mode yet. + void EnableMKLDNN() override; + + /// \brief Not supported in GPU mode yet. + void EnableMkldnnQuantizer() override; + + /// \brief Default destructor. + virtual ~GpuPassStrategy() = default; + + protected: + /// \cond Protected + bool use_cudnn_{false}; + /// \endcond +}; +/// \brief List of tensorRT subgraph passes. 
+extern const std::vector kTRTSubgraphPasses; + +/// \brief List of lite subgraph passes. +extern const std::vector kLiteSubgraphPasses; + +} // namespace paddle diff --git a/doc/fluid/Paddle-Inference/docs/requirements.txt b/doc/fluid/Paddle-Inference/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..48b25191d79908844c5523b36bd82134d4edd5ca --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/requirements.txt @@ -0,0 +1,6 @@ +breathe==4.18.1 +sphinx==3.0.3 +recommonmark +sphinx_markdown_tables==0.0.14 +sphinx_rtd_theme==0.4.3 +exhale==0.2.3 diff --git a/doc/fluid/Paddle-Inference/docs/tools/visual.rst b/doc/fluid/Paddle-Inference/docs/tools/visual.rst new file mode 100644 index 0000000000000000000000000000000000000000..859a252534df3ef950866846c2cae4c9945b78c1 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/tools/visual.rst @@ -0,0 +1,82 @@ +模型可视化 +============== + +通过 `Quick Start <../introduction/quick_start.html>`_ 一节中,我们了解到,预测模型包含了两个文件,一部分为模型结构文件,通常以 **model** 或 **__model__** 文件存在;另一部分为参数文件,通常以params 文件或一堆分散的文件存在。 + +模型结构文件,顾名思义,存储了模型的拓扑结构,其中包括模型中各种OP的计算顺序以及OP的详细信息。很多时候,我们希望能够将这些模型的结构以及内部信息可视化,方便我们进行模型分析。接下来将会通过两种方式来讲述如何对Paddle 预测模型进行可视化。 + +一: 通过 VisualDL 可视化 +------------------ + +1) 安装 + +VisualDL是飞桨可视化分析工具,以丰富的图表呈现训练参数变化趋势、模型结构、数据样本、高维数据分布等,帮助用户更清晰直观地理解深度学习模型训练过程及模型结构,实现高效的模型优化。 +我们可以进入 `GitHub主页 `_ 进行下载安装。 + +2)可视化 + +`点击 `_ 下载测试模型。 + +支持两种启动方式: + +- 前端拖拽上传模型文件: + + - 无需添加任何参数,在命令行执行 visualdl 后启动界面上传文件即可: + + +.. image:: https://user-images.githubusercontent.com/48054808/88628504-a8b66980-d0e0-11ea-908b-196d02ed1fa2.png + + +- 后端透传模型文件: + + - 在命令行加入参数 --model 并指定 **模型文件** 路径(非文件夹路径),即可启动: + +.. code:: python + + visualdl --model ./log/model --port 8080 + + +.. image:: https://user-images.githubusercontent.com/48054808/88621327-b664f280-d0d2-11ea-9e76-e3fcfeea4e57.png + +Graph功能详细使用,请见 `Graph使用指南 `_ 。 + +二: 通过代码方式生成dot文件 +--------------------- + +1)pip 安装Paddle + +2)生成dot文件 + +`点击 `_ 下载测试模型。 + +.. code:: python + + #!/usr/bin/env python + import paddle.fluid as fluid + from paddle.fluid import core + from paddle.fluid.framework import IrGraph + + def get_graph(program_path): + with open(program_path, 'rb') as f: + binary_str = f.read() + program = fluid.framework.Program.parse_from_string(binary_str) + return IrGraph(core.Graph(program.desc), for_test=True) + + if __name__ == '__main__': + program_path = './lecture_model/__model__' + offline_graph = get_graph(program_path) + offline_graph.draw('.', 'test_model', []) + + +3)生成svg + +**Note:需要环境中安装graphviz** + +.. code:: python + + dot -Tsvg ./test_mode.dot -o test_model.svg + + +然后将test_model.svg以浏览器打开预览即可。 + +.. 
image:: https://user-images.githubusercontent.com/5595332/81796500-19b59e80-9540-11ea-8c70-31122e969683.png diff --git a/doc/fluid/Paddle-Inference/docs/tools/x2paddle.rst b/doc/fluid/Paddle-Inference/docs/tools/x2paddle.rst new file mode 100644 index 0000000000000000000000000000000000000000..c0ef189cc0f7edd25cff4dd6965f73abf074a6cb --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/tools/x2paddle.rst @@ -0,0 +1,81 @@ +模型转换工具 X2Paddle +===================== + +X2Paddle可以将caffe、tensorflow、onnx模型转换成Paddle支持的模型。 + +`X2Paddle `_ 支持将Caffe/TensorFlow模型转换为PaddlePaddle模型。目前X2Paddle支持的模型参考 `x2paddle_model_zoo `_ 。 + + +多框架支持 +---------------- + +================ ===== ========== ==== + 模型 caffe tensorflow onnx +================ ===== ========== ==== +mobilenetv1 Y Y F +mobilenetv2 Y Y Y +resnet18 Y Y F +resnet50 Y Y Y +mnasnet Y Y F +efficientnet Y Y Y +squeezenetv1.1 Y Y Y +shufflenet Y Y F +mobilenet_ssd Y Y F +mobilenet_yolov3 F Y F +inceptionv4 F F F +mtcnn Y Y F +facedetection Y F F +unet Y Y F +ocr_attention F F F +vgg16 F F F +================ ===== ========== ==== + + +安装 +--------------- + +.. code:: shell + + pip install x2paddle + + +安装最新版本,可使用如下安装方式 + +.. code:: shell + + pip install git+https://github.com/PaddlePaddle/X2Paddle.git@develop + +使用 +------------ + +Caffe +>>>>>>>>>>>>>> + +.. code:: shell + + x2paddle --framework caffe \ + --prototxt model.proto \ + --weight model.caffemodel \ + --save_dir paddle_model + +TensorFlow +>>>>>>>>>> + +.. code:: shell + + x2paddle --framework tensorflow \ + --model model.pb \ + --save_dir paddle_model + + +转换结果说明 +-------------- + +在指定的 `save_dir` 下生成两个目录 + +1. inference_model : 模型结构和参数均序列化保存的模型格式 +2. model_with_code : 保存了模型参数文件和模型的python代码 + +**问题反馈** + +X2Paddle使用时存在问题时,欢迎您将问题或Bug报告以 `Github Issues `_ 的形式提交给我们,我们会实时跟进。 diff --git a/doc/fluid/Paddle-Inference/docs/user_guides/cxx_api.rst b/doc/fluid/Paddle-Inference/docs/user_guides/cxx_api.rst new file mode 100644 index 0000000000000000000000000000000000000000..22d62600d08aa29b77b5f8018927bb2cdfb9feea --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/user_guides/cxx_api.rst @@ -0,0 +1,245 @@ +使用C++预测 +========== +为了简单方便地进行推理部署,飞桨提供了一套高度优化的C++ API推理接口。下面对各主要API使用方法进行详细介绍。 + +在 `使用流程 <./tutorial.html>`_ 一节中,我们了解到Paddle Inference预测包含了以下几个方面: + +- 配置推理选项 +- 创建predictor +- 准备模型输入 +- 模型推理 +- 获取模型输出 + +那我们先用一个简单的程序介绍这一过程: + +.. 
code:: c++ + + std::unique_ptr CreatePredictor() { + // 通过AnalysisConfig配置推理选项 + AnalysisConfig config; + config.SetModel(“./resnet50/model”, + "./resnet50/params"); + config.EnableUseGpu(100, 0); + config.SwitchUseFeedFetchOps(false); + config.EnableMKLDNN(); + config.EnableMemoryOptim(); + // 创建predictor + return CreatePaddlePredictor(config); + } + + void Run(paddle::PaddlePredictor *predictor, + const std::vector& input, + const std::vector& input_shape, + std::vector *out_data) { + // 准备模型的输入 + int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies()); + + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape(input_shape); + input_t->copy_from_cpu(input.data()); + // 模型推理 + CHECK(predictor->ZeroCopyRun()); + + // 获取模型的输出 + auto output_names = predictor->GetOutputNames(); + // there is only one output of Resnet50 + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); + } + + +以上的程序中 **CreatePredictor** 函数对推理过程进行了配置以及创建了Predictor。 **Run** 函数进行了输入数据的准备、模型推理以及输出数据的获取过程。 + +接下来我们依次对程序中出现的AnalysisConfig,Predictor,模型输入,模型输出做一个详细的介绍。 + +一:关于AnalysisConfig +------------------ + +AnalysisConfig管理AnalysisPredictor的推理配置,提供了模型路径设置、推理引擎运行设备选择以及多种优化推理流程的选项。配置中包括了必选配置以及可选配置。 + +1. 必选配置 +>>>>>>>>>>>> + +**a. 设置模型和参数路径** + +从磁盘加载模型时,根据模型和参数文件存储方式不同,设置AnalysisConfig加载模型和参数的路径有两种形式: + +* **non-combined形式** :模型文件夹model_dir下存在一个模型文件和多个参数文件时,传入模型文件夹路径,模型文件名默认为__model__。 使用方式为: `config->SetModel("./model_dir")`;。 +* **combined形式** :模型文件夹model_dir下只有一个模型文件`model`和一个参数文件params时,传入模型文件和参数文件路径。 使用方式为: `config->SetModel("./model_dir/model", "./model_dir/params");`。 +* 内存加载模式:如果模型是从内存加载(模型必须为combined形式),可以使用 + +.. code:: c++ + + std::ifstream in_m(FLAGS_dirname + "/model"); + std::ifstream in_p(FLAGS_dirname + "/params"); + std::ostringstream os_model, os_param; + os_model << in_m.rdbuf(); + os_param << in_p.rdbuf(); + config.SetModelBuffer(os_model.str().data(), os_model.str().size(), os_param.str().data(), os_param.str().size()); + +Paddle Inference有两种格式的模型,分别为 **non-combined** 以及 **combined** 。这两种类型我们在 `Quick Start <../introduction/quick_start.html>`_ 一节中提到过,忘记的同学可以回顾下。 + +**b. 关闭Feed,Fetch op** + +config->SwitchUseFeedFetchOps(false); // 关闭feed和fetch OP使用,使用ZeroCopy接口必须设置此项` + +我们用一个小的例子来说明我们为什么要关掉它们。 +假设我们有一个模型,模型运行的序列为: +**input -> FEED_OP -> feed_out -> CONV_OP -> conv_out -> FETCH_OP -> output** + +序列中大些字母的FEED_OP, CONV_OP, FETCH_OP 为模型中的OP, 小写字母的input,feed_out,output 为模型中的变量。 + +在ZeroCopy模式下,我们通过 `predictor->GetInputTensor(input_names[0])` 获取的模型输入为FEED_OP的输出, 即feed_out,我们通过 `predictor->GetOutputTensor(output_names[0])` 接口获取的模型输出为FETCH_OP的输入,即conv_out,这种情况下,我们在运行期间就没有必要运行feed和fetch OP了,因此需要设置 `config->SwitchUseFeedFetchOps(false)` 来关闭feed和fetch op。 + + +2. 可选配置 +>>>>>>>>>> + +**a. 加速CPU推理** + +.. code:: + + // 开启MKLDNN,可加速CPU推理,要求预测库带MKLDNN功能。 + config->EnableMKLDNN(); + // 可以设置CPU数学库线程数math_threads,可加速推理。 + // 注意:math_threads * 外部线程数 需要小于总的CPU的核心数目,否则会影响预测性能。 + config->SetCpuMathLibraryNumThreads(10); + + +**b. 使用GPU推理** + +.. code:: + + // EnableUseGpu后,模型将运行在GPU上。 + // 第一个参数表示预先分配显存数目,第二个参数表示设备的ID。 + config->EnableUseGpu(100, 0); + + +如果使用的预测lib带Paddle-TRT子图功能,可以打开TRT选项进行加速, 详细的请访问 `Paddle-TensorRT文档 <../optimize/paddle_trt.html>`_: + +.. 
code:: c++ + + // 开启TensorRT推理,可提升GPU推理性能,需要使用带TensorRT的推理库 + config->EnableTensorRtEngine(1 << 30 /*workspace_size*/, + batch_size /*max_batch_size*/, + 3 /*min_subgraph_size*/, + AnalysisConfig::Precision::kFloat32 /*precision*/, + false /*use_static*/, + false /*use_calib_mode*/); + +通过计算图分析,Paddle可以自动将计算图中部分子图融合,并调用NVIDIA的 TensorRT 来进行加速。 + + +**c. 内存/显存优化** + +.. code:: c++ + + config->EnableMemoryOptim(); // 开启内存/显存复用 + +该配置设置后,在模型图分析阶段会对图中的变量进行依赖分类,两两互不依赖的变量会使用同一块内存/显存空间,缩减了运行时的内存/显存占用(模型较大或batch较大时效果显著)。 + + +**d. debug开关** + + +.. code:: c++ + + // 该配置设置后,会关闭模型图分析阶段的任何图优化,预测期间运行同训练前向代码一致。 + config->SwitchIrOptim(false); + // 该配置设置后,会在模型图分析的每个阶段后保存图的拓扑信息到.dot文件中,该文件可用graphviz可视化。 + config->SwitchIrDebug(); + + +二:关于PaddlePredictor +----------------------- +PaddlePredictor 是在模型上执行推理的预测器,根据AnalysisConfig中的配置进行创建。 + + +.. code:: c++ + + std::unique_ptr predictor = CreatePaddlePredictor(config); + + +CreatePaddlePredictor 期间首先对模型进行加载,并且将模型转换为由变量和运算节点组成的计算图。接下来将进行一系列的图优化,包括OP的横向纵向融合,删除无用节点,内存/显存优化,以及子图(Paddle-TRT)的分析,加速推理性能,提高吞吐。 + + +三:输入输出 +-------------------------- + +1. 准备输入 +>>>>>>>>>>>>>>>>> + +**a. 获取模型所有输入的tensor名字** + +.. code:: c++ + + std::vector input_names = predictor->GetInputNames(); + +**b. 获取对应名字下的tensor** + + +.. code:: c++ + + // 获取第0个输入 + auto input_t = predictor->GetInputTensor(input_names[0]); + +**c. 将数据copy到tensor中** + +.. code:: c++ + + // 在copy前需要设置tensor的shape + input_t->Reshape({batch_size, channels, height, width}); + // tensor会根据上述设置的shape从input_data中拷贝对应数目的数据到tensor中。 + input_t->copy_from_cpu(input_data /*数据指针*/); + +当然我们也可以用mutable_data获取tensor的数据指针: + +.. code:: c++ + + // 参数可为PaddlePlace::kGPU, PaddlePlace::kCPU + float *input_d = input_t->mutable_data(PaddlePlace::kGPU); + + +2. 获取输出 +>>>>>>>> + +**a. 获取模型所有输出的tensor名字** + +.. code:: c++ + + std::vector out_names = predictor->GetOutputNames(); + +**b. 获取对应名字下的tensor** + +.. code:: c++ + + // 获取第0个输出 + auto output_t = predictor->GetOutputTensor(out_names[0]); + +**c. 将数据copy到tensor中** + +.. code:: c++ + + std::vector out_data; + // 获取输出的shpae + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); + + +我们可以用data接口获取tensor的数据指针: + +.. code:: c++ + + // 参数可为PaddlePlace::kGPU, PaddlePlace::kCPU + int output_size; + float *output_d = output_t->data(PaddlePlace::kGPU, &output_size); + +**下一步** + +看到这里您是否已经对Paddle Inference的C++使用有所了解了呢?请访问 `这里 `_ 进行样例测试。 diff --git a/doc/fluid/Paddle-Inference/docs/user_guides/inference_python_api.rst b/doc/fluid/Paddle-Inference/docs/user_guides/inference_python_api.rst new file mode 100644 index 0000000000000000000000000000000000000000..ddd34f7c8434c22f77874a961665e362e27cca33 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/user_guides/inference_python_api.rst @@ -0,0 +1,218 @@ +使用Python预测 +=============== + +Paddle Inference提供了高度优化的Python 和C++ API预测接口,本篇文档主要介绍Python API,使用C++ API进行预测的文档可以参考可以参考 `这里 <./cxx_api.html>`_ 。 + +下面是详细的使用说明。 + +使用Python预测API预测包含以下几个主要步骤: + +- 配置推理选项 +- 创建Predictor +- 准备模型输入 +- 模型推理 +- 获取模型输出 + +我们先从一个简单程序入手,介绍这一流程: + +.. 
code:: python + + def create_predictor(): + # 通过AnalysisConfig配置推理选项 + config = AnalysisConfig("./resnet50/model", "./resnet50/params") + config.switch_use_feed_fetch_ops(False) + config.enable_use_gpu(100, 0) + config.enable_mkldnn() + config.enable_memory_optim() + predictor = create_paddle_predictor(config) + return predictor + + def run(predictor, data): + # 准备模型输入 + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(data[i].shape) + input_tensor.copy_from_cpu(data[i].copy()) + + # 执行模型推理 + predictor.zero_copy_run() + + results = [] + # 获取模型输出 + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results + + +以上的程序中 **create_predictor** 函数对推理过程进行了配置以及创建了Predictor。 **run** 函数进行了输入数据的准备、模型推理以及输出数据的获取过程。 + +在接下来的部分中,我们会依次对程序中出现的AnalysisConfig,Predictor,模型输入,模型输出进行详细的介绍。 + +一、推理配置管理器AnalysisConfig +---------------------------- +AnalysisConfig管理AnalysisPredictor的推理配置,提供了模型路径设置、推理引擎运行设备选择以及多种优化推理流程的选项。配置中包括了必选配置以及可选配置。 + +1. 必选配置 +>>>>>>>>>>>> + +**a.设置模型和参数路径** + +* **Non-combined形式**:模型文件夹 model_dir 下存在一个模型文件和多个参数文件时,传入模型文件夹路径,模型文件名默认为__model__。 使用方式为: `config.set_model("./model_dir")` + +* Combined形式:模型文件夹 model_dir 下只有一个模型文件 model 和一个参数文件params时,传入模型文件和参数文件路径。使用方式为: `config.set_model("./model_dir/model", "./model_dir/params")` + +* 内存加载模式:如果模型是从内存加载,可以使用: + + .. code:: python + + import os + model_buffer = open('./resnet50/model','rb') + params_buffer = open('./resnet50/params','rb') + model_size = os.fstat(model_buffer.fileno()).st_size + params_size = os.fstat(params_buffer.fileno()).st_size + config.set_model_buffer(model_buffer.read(), model_size, params_buffer.read(), params_size) + + +关于 non-combined 以及 combined 模型介绍,请参照 `这里 <../introduction/quick_start.html>`_。 + +**b. 关闭feed与fetch OP** + +config.switch_use_feed_fetch_ops(False) # 关闭feed和fetch OP + +2. 可选配置 +>>>>>>>>> + +**a. 加速CPU推理** + +.. code:: python + + # 开启MKLDNN,可加速CPU推理,要求预测库带MKLDNN功能。 + config.enable_mkldnn() + # 可以设置CPU数学库线程数math_threads,可加速推理。 + # 注意:math_threads * 外部线程数 需要小于总的CPU的核心数目,否则会影响预测性能。 + config.set_cpu_math_library_num_threads(10) + + +**b. 使用GPU推理** + +.. code:: python + + # enable_use_gpu后,模型将运行在GPU上。 + # 第一个参数表示预先分配显存数目,第二个参数表示设备的ID。 + config.enable_use_gpu(100, 0) + +如果使用的预测lib带Paddle-TRT子图功能,可以打开TRT选项进行加速: + +.. code:: python + + + # 开启TensorRT推理,可提升GPU推理性能,需要使用带TensorRT的推理库 + config.enable_tensorrt_engine(1 << 30, # workspace_size + batch_size, # max_batch_size + 3, # min_subgraph_size + AnalysisConfig.Precision.Float32, # precision + False, # use_static + False, # use_calib_mode + ) + +通过计算图分析,Paddle可以自动将计算图中部分子图融合,并调用NVIDIA的 TensorRT 来进行加速。 +使用Paddle-TensorRT 预测的完整方法可以参考 `这里 <../optimize/paddle_trt.html>`_。 + + +**c. 内存/显存优化** + +.. code:: python + + config.enable_memory_optim() # 开启内存/显存复用 + +该配置设置后,在模型图分析阶段会对图中的变量进行依赖分类,两两互不依赖的变量会使用同一块内存/显存空间,缩减了运行时的内存/显存占用(模型较大或batch较大时效果显著)。 + + +**d. debug开关** + + +.. code:: python + + # 该配置设置后,会关闭模型图分析阶段的任何图优化,预测期间运行同训练前向代码一致。 + config.switch_ir_optim(False) + + +.. code:: python + + # 该配置设置后,会在模型图分析的每个阶段后保存图的拓扑信息到.dot文件中,该文件可用graphviz可视化。 + config.switch_ir_debug(True) + +二、预测器PaddlePredictor +---------------------- + +PaddlePredictor 是在模型上执行推理的预测器,根据AnalysisConfig中的配置进行创建。 + +.. 
code:: python + + predictor = create_paddle_predictor(config) + + +create_paddle_predictor 期间首先对模型进行加载,并且将模型转换为由变量和运算节点组成的计算图。接下来将进行一系列的图优化,包括OP的横向纵向融合,删除无用节点,内存/显存优化,以及子图(Paddle-TRT)的分析,加速推理性能,提高吞吐。 + + +三:输入/输出 +--------------- + +1.准备输入 +>>>>>>>>>>>> + +**a. 获取模型所有输入的Tensor名字** + +.. code:: python + + input_names = predictor.get_input_names() + +**b. 获取对应名字下的Tensor** + +.. code:: python + + # 获取第0个输入 + input_tensor = predictor.get_input_tensor(input_names[0]) + +**c. 将输入数据copy到Tensor中** + +.. code:: python + + # 在copy前需要设置Tensor的shape + input_tensor.reshape((batch_size, channels, height, width)) + # Tensor会根据上述设置的shape从input_data中拷贝对应数目的数据。input_data为numpy数组。 + input_tensor.copy_from_cpu(input_data) + + +2.获取输出 +>>>>>>>>> + +**a. 获取模型所有输出的Tensor名字** + +.. code::python + + output_names = predictor.get_output_names() + +**b. 获取对应名字下的Tensor** + +.. code:: python + + # 获取第0个输出 + output_tensor = predictor.get_output_tensor(ouput_names[0]) + +**c. 将数据copy到Tensor中** + +.. code:: python + + # output_data为numpy数组 + output_data = output_tensor.copy_to_cpu() + + +**下一步** + +看到这里您是否已经对 Paddle Inference 的 Python API 使用有所了解了呢?请访问 `这里 `_ 进行样例测试。 diff --git a/doc/fluid/Paddle-Inference/docs/user_guides/source_compile.rst b/doc/fluid/Paddle-Inference/docs/user_guides/source_compile.rst new file mode 100644 index 0000000000000000000000000000000000000000..bcac00b26b95df960ecba9ab39a941a9393f3553 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/user_guides/source_compile.rst @@ -0,0 +1,271 @@ +源码编译 +======== + +什么时候需要源码编译? +-------------- + +深度学习的发展十分迅速,对科研或工程人员来说,可能会遇到一些需要自己开发op的场景,可以在python层面编写op,但如果对性能有严格要求的话则必须在C++层面开发op,对于这种情况,需要用户源码编译飞桨,使之生效。 +此外对于绝大多数使用C++将模型部署上线的工程人员来说,您可以直接通过飞桨官网下载已编译好的预测库,快捷开启飞桨使用之旅。`飞桨官网 `_ 提供了多个不同环境下编译好的预测库。如果用户环境与官网提供环境不一致(如cuda 、cudnn、tensorrt版本不一致等),或对飞桨源代码有修改需求,或希望进行定制化构建,可查阅本文档自行源码编译得到预测库。 + +编译原理 +--------- + +**一:目标产物** + +飞桨框架的源码编译包括源代码的编译和链接,最终生成的目标产物包括: + + - 含有 C++ 接口的头文件及其二进制库:用于C++环境,将文件放到指定路径即可开启飞桨使用之旅。 + - Python Wheel 形式的安装包:用于Python环境,此安装包需要参考 `飞桨安装教程 `_ 进行安装操作。也就是说,前面讲的pip安装属于在线安装,这里属于本地安装。 + +**二:基础概念** + +飞桨主要由C++语言编写,通过pybind工具提供了Python端的接口,飞桨的源码编译主要包括编译和链接两步。 +* 编译过程由编译器完成,编译器以编译单元(后缀名为 .cc 或 .cpp 的文本文件)为单位,将 C++ 语言 ASCII 源代码翻译为二进制形式的目标文件。一个工程通常由若干源码文件组织得到,所以编译完成后,将生成一组目标文件。 +* 链接过程使分离编译成为可能,由链接器完成。链接器按一定规则将分离的目标文件组合成一个能映射到内存的二进制程序文件,并解析引用。由于这个二进制文件通常包含源码中指定可被外部用户复用的函数接口,所以也被称作函数库。根据链接规则不同,链接可分为静态和动态链接。静态链接对目标文件进行归档;动态链接使用地址无关技术,将链接放到程序加载时进行。 +配合包含声明体的头文件(后缀名为 .h 或 .hpp),用户可以复用程序库中的代码开发应用。静态链接构建的应用程序可独立运行,而动态链接程序在加载运行时需到指定路径下搜寻其依赖的二进制库。 + +**三:编译方式** + +飞桨框架的设计原则之一是满足不同平台的可用性。然而,不同操作系统惯用的编译和链接器是不一样的,使用它们的命令也不一致。比如,Linux 一般使用 GNU 编译器套件(GCC),Windows 则使用 Microsoft Visual C++(MSVC)。为了统一编译脚本,飞桨使用了支持跨平台构建的 CMake,它可以输出上述编译器所需的各种 Makefile 或者 Project 文件。 +为方便编译,框架对常用的CMake命令进行了封装,如仿照 Bazel工具封装了 cc_binary 和 cc_library ,分别用于可执行文件和库文件的产出等,对CMake感兴趣的同学可在 cmake/generic.cmake 中查看具体的实现逻辑。Paddle的CMake中集成了生成python wheel包的逻辑,对如何生成wheel包感兴趣的同学可参考 `相关文档 `_ 。 + + +编译步骤 +----------- + +飞桨分为 CPU 版本和 GPU 版本。如果您的计算机没有 Nvidia GPU,请选择 CPU 版本构建安装。如果您的计算机含有 Nvidia GPU( 1.0 且预装有 CUDA / CuDNN,也可选择 GPU 版本构建安装。本节简述飞桨在常用环境下的源码编译方式,欢迎访问飞桨官网获取更详细内容。请阅读本节内容。 + +**推荐配置及依赖项** + +1、稳定的互联网连接,主频 1 GHz 以上的多核处理器,9 GB 以上磁盘空间。 +2、Python 版本 2.7 或 3.5 以上,pip 版本 9.0 及以上;CMake v3.5 及以上;Git 版本 2.17 及以上。请将可执行文件放入系统环境变量中以方便运行。 +3、GPU 版本额外需要 Nvidia CUDA 9 / 10,CuDNN v7 及以上版本。根据需要还可能依赖 NCCL 和 TensorRT。 + + +基于Ubuntu 18.04 +------------ + +**一:环境准备** + +除了本节开头提到的依赖,在 Ubuntu 上进行飞桨的源码编译,您还需要准备 GCC8 编译器等工具,可使用下列命令安装: + +.. 
code:: shell + + sudo apt-get install gcc g++ make cmake git vim unrar python3 python3-dev python3-pip swig wget patchelf libopencv-dev + pip3 install numpy protobuf wheel setuptools + +若需启用 cuda 加速,需准备 cuda、cudnn、nccl。上述工具的安装请参考 nvidia 官网,以 cuda10.1,cudnn7.6 为例配置 cuda 环境。 + +.. code:: shell + + # cuda + sh cuda_10.1.168_418.67_linux.run + export PATH=/usr/local/cuda-10.1/bin${PATH:+:${PATH}} + export LD_LIBRARY_PATH=/usr/local/cuda-10.1/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + + # cudnn + tar -xzvf cudnn-10.1-linux-x64-v7.6.4.38.tgz + sudo cp -a cuda/include/cudnn.h /usr/local/cuda/include/ + sudo cp -a cuda/lib64/libcudnn* /usr/local/cuda/lib64/ + + # nccl + # install nccl local deb 参考https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html + sudo dpkg -i nccl-repo-ubuntu1804-2.5.6-ga-cuda10.1_1-1_amd64.deb + # 根据安装提示,还需要执行sudo apt-key add /var/nccl-repo-2.5.6-ga-cuda10.1/7fa2af80.pub + sudo apt update + sudo apt install libnccl2 libnccl-dev + + sudo ldconfig + + +**编译飞桨过程中可能会打开很多文件,Ubuntu 18.04 默认设置最多同时打开的文件数是1024(参见 ulimit -a),需要更改这个设定值。** + + +在 /etc/security/limits.conf 文件中添加两行。 + +.. code:: shell + + * hard noopen 102400 + * soft noopen 102400 + +重启计算机,重启后执行以下指令,请将${user}切换成当前用户名。 + +.. code:: shell + + su ${user} + ulimit -n 102400 + + +**二:编译命令** + +使用 Git 将飞桨代码克隆到本地,并进入目录,切换到稳定版本(git tag显示的标签名,如v1.7.1)。 +**飞桨使用 develop 分支进行最新特性的开发,使用 release 分支发布稳定版本。在 GitHub 的 Releases 选项卡中,可以看到飞桨版本的发布记录。** + +.. code:: shell + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + git checkout v1.7.1 + +下面以 GPU 版本为例说明编译命令。其他环境可以参考“CMake编译选项表”修改对应的cmake选项。比如,若编译 CPU 版本,请将 WITH_GPU 设置为 OFF。 + + +.. code:: shell + + # 创建并进入 build 目录 + mkdir build_cuda && cd build_cuda + # 执行cmake指令 + cmake -DPY_VERSION=3 \ + -DWITH_TESTING=OFF \ + -DWITH_MKL=ON \ + -DWITH_GPU=ON \ + -DON_INFER=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + .. + +**使用make编译** + +make -j4 + +**编译成功后可在dist目录找到生成的.whl包** + +pip3 install python/dist/paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl + +**预测库编译** + +make inference_lib_dist -j4 + + +**cmake编译环境表** + +以下介绍的编译方法都是通用步骤,根据环境对应修改cmake选项即可。 + +================ ============================================================================ ============================================================= + 选项 说明 默认值 +================ ============================================================================ ============================================================= +WITH_GPU 是否支持GPU ON +WITH_AVX 是否编译含有AVX指令集的飞桨二进制文件 ON +WITH_PYTHON 是否内嵌PYTHON解释器并编译Wheel安装包 ON +WITH_TESTING 是否开启单元测试 OFF +WITH_MKL 是否使用MKL数学库,如果为否,将使用OpenBLAS ON +WITH_SYSTEM_BLAS 是否使用系统自带的BLAS OFF +WITH_DISTRIBUTE 是否编译带有分布式的版本 OFF +WITH_BRPC_RDMA 是否使用BRPC,RDMA作为RPC协议 OFF +ON_INFER 是否打开预测优化 OFF +CUDA_ARCH_NAME 是否只针对当前CUDA架构编译 All:编译所有可支持的CUDA架构;Auto:自动识别当前环境的架构编译 +TENSORRT_ROOT TensorRT_lib的路径,该路径指定后会编译TRT子图功能eg:/paddle/nvidia/TensorRT/ /usr +================ ============================================================================ ============================================================= + +基于Windows 10 +------------------- + +**一:环境准备** + +除了本节开头提到的依赖,在 Windows 10 上编译飞桨,您还需要准备 Visual Studio 2015 Update3 以上版本。本节以 Visual Studio 企业版 2019(C++ 桌面开发,含 MSVC 14.24)、Python 3.8 为例介绍编译过程。 + +在命令提示符输入下列命令,安装必需的 Python 组件。 + +.. code:: shell + + pip3 install numpy protobuf wheel` + +**二:编译命令** + +使用 Git 将飞桨代码克隆到本地,并进入目录,切换到稳定版本(git tag显示的标签名,如v1.7.1)。 +**飞桨使用 develop 分支进行最新特性的开发,使用 release 分支发布稳定版本。在 GitHub 的 Releases 选项卡中,可以看到 Paddle 版本的发布记录。** + +.. 
code:: shell + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + git checkout v1.7.1 + +创建一个构建目录,并在其中执行 CMake,生成解决方案文件 Solution File,以编译 CPU 版本为例说明编译命令,其他环境可以参考“CMake编译选项表”修改对应的cmake选项。 + +.. code:: shell + + mkdir build + cd build + cmake .. -G "Visual Studio 16 2019" -A x64 -DWITH_GPU=OFF -DWITH_TESTING=OFF + -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3 + +.. image:: https://agroup-bos.cdn.bcebos.com/1b21aff9424cb33a98f2d1e018d8301614caedda + +使用 Visual Studio 打开解决方案文件,在窗口顶端的构建配置菜单中选择 Release x64,单击生成解决方案,等待构建完毕即可。 + +**cmake编译环境表** + +================ ============================================================================ ============================================================= + 选项 说明 默认值 +================ ============================================================================ ============================================================= +WITH_GPU 是否支持GPU ON +WITH_AVX 是否编译含有AVX指令集的飞桨二进制文件 ON +WITH_PYTHON 是否内嵌PYTHON解释器并编译Wheel安装包 ON +WITH_TESTING 是否开启单元测试 OFF +WITH_MKL 是否使用MKL数学库,如果为否,将使用OpenBLAS ON +WITH_SYSTEM_BLAS 是否使用系统自带的BLAS OFF +WITH_DISTRIBUTE 是否编译带有分布式的版本 OFF +WITH_BRPC_RDMA 是否使用BRPC,RDMA作为RPC协议 OFF +ON_INFER 是否打开预测优化 OFF +CUDA_ARCH_NAME 是否只针对当前CUDA架构编译 All:编译所有可支持的CUDA架构;Auto:自动识别当前环境的架构编译 +TENSORRT_ROOT TensorRT_lib的路径,该路径指定后会编译TRT子图功能eg:/paddle/nvidia/TensorRT/ /usr +================ ============================================================================ ============================================================= + +**结果验证** + +**一:python whl包** + +编译完毕后,会在 python/dist 目录下生成一个文件名类似 paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl 的 Python Wheel 安装包,安装测试的命令为: + +.. code:: shell + + pip3 install python/dist/paddlepaddle-1.7.1-cp36-cp36m-linux_x86_64.whl + +安装完成后,可以使用 python3 进入python解释器,输入以下指令,出现 `Your Paddle Fluid is installed succesfully! ` ,说明安装成功。 + +.. code:: python + + import paddle.fluid as fluid + fluid.install_check.run_check() + + +**二:c++ lib** + +预测库编译后,所有产出均位于build目录下的fluid_inference_install_dir目录内,目录结构如下。version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号。 + +.. code:: shell + + build/fluid_inference_install_dir + ├── CMakeCache.txt + ├── paddle + │ ├── include + │ │ ├── paddle_anakin_config.h + │ │ ├── paddle_analysis_config.h + │ │ ├── paddle_api.h + │ │ ├── paddle_inference_api.h + │ │ ├── paddle_mkldnn_quantizer_config.h + │ │ └── paddle_pass_builder.h + │ └── lib + │ ├── libpaddle_fluid.a (Linux) + │ ├── libpaddle_fluid.so (Linux) + │ └── libpaddle_fluid.lib (Windows) + ├── third_party + │ ├── boost + │ │ └── boost + │ ├── eigen3 + │ │ ├── Eigen + │ │ └── unsupported + │ └── install + │ ├── gflags + │ ├── glog + │ ├── mkldnn + │ ├── mklml + │ ├── protobuf + │ ├── xxhash + │ └── zlib + └── version.txt + + +Include目录下包括了使用飞桨预测库需要的头文件,lib目录下包括了生成的静态库和动态库,third_party目录下包括了预测库依赖的其它库文件。 + +您可以编写应用代码,与预测库联合编译并测试结果。请参 `C++ 预测库 API 使用 `_ 一节。 diff --git a/doc/fluid/Paddle-Inference/docs/user_guides/tutorial.rst b/doc/fluid/Paddle-Inference/docs/user_guides/tutorial.rst new file mode 100644 index 0000000000000000000000000000000000000000..f9f79fe7120f7b99ebfad784e8196356adbeb598 --- /dev/null +++ b/doc/fluid/Paddle-Inference/docs/user_guides/tutorial.rst @@ -0,0 +1,69 @@ +使用流程 +=========== + +一: 模型准备 +--------------- + +Paddle Inference目前支持的模型结构为PaddlePaddle深度学习框架产出的模型格式。因此,在您开始使用 Paddle Inference框架前您需要准备一个由PaddlePaddle框架保存的模型。 如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 X2Paddle 工具进行模型格式转换。 + +二: 环境准备 +--------------- + +**1) Python 环境** + +安装Python环境有以下三种方式: + +a. 参照 `官方主页 `_ 的引导进行pip安装。 + +b. 
参照接下来的 `预测库编译 <./source_compile.html>`_ 页面进行自行编译。 + +c. 使用docker镜像 + +.. code:: shell + + # 拉取镜像,该镜像预装Paddle 1.8 Python环境 + docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" + + docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash + +**2) C++ 环境** + +获取c++预测库有以下三种方式: + +a. `官网 `_ 下载预编译库 + +b. 使用docker镜像 + +.. code:: shell + + # 拉取镜像,在容器内主目录~/下存放c++预编译库。 + docker pull hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + export NVIDIA_SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi" + + docker run $CUDA_SO $DEVICES $NVIDIA_SMI --name trt_open --privileged --security-opt seccomp=unconfined --net=host -v $PWD:/paddle -it hub.baidubce.com/paddlepaddle/paddle:1.8.0-gpu-cuda10.0-cudnn7-trt6 /bin/bash + +c. 参照接下来的 `预测库编译 <./source_compile.html>`_页面进行自行编译。 + +三:使用Paddle Inference执行预测 +----------------- + +使用Paddle Inference进行推理部署的流程如下所示。 + +.. image:: https://ai-studio-static-online.cdn.bcebos.com/10d5cee239374bd59e41283b3233f49dc306109da9d540b48285980810ab4e36 + +1) 配置推理选项。 **AnalysisConfig** 是飞桨提供的配置管理器API。在使用Paddle Inference进行推理部署过程中,需要使用 **AnalysisConfig** 详细地配置推理引擎参数,包括但不限于在何种设备(CPU/GPU)上部署( **config.EnableUseGPU** )、加载模型路径、开启/关闭计算图分析优化、使用MKLDNN/TensorRT进行部署的加速等。参数的具体设置需要根据实际需求来定。 + +2) 创建 **AnalysisPredictor** 。 **AnalysisPredictor** 是Paddle Inference提供的推理引擎。你只需要简单的执行一行代码即可完成预测引擎的初始化 **std::unique_ptr predictor = CreatePaddlePredictor(config)** ,config为1步骤中创建的 **AnalysisConfig**。 + +3) 准备输入数据。执行 **auto input_names = predictor->GetInputNames()** ,您会获取到模型所有输入tensor的名字,同时通过执行 **auto tensor = predictor->GetInputTensor(input_names[i])** ; 您可以获取第i个输入的tensor,通过 **tensor->copy_from_cpu(data)** 方式,将data中的数据拷贝到tensor中。 + +4) 调用predictor->ZeroCopyRun()执行推理。 + +5) 获取推理输出。执行 **auto out_names = predictor->GetOutputNames()** ,您会获取到模型所有输出tensor的名字,同时通过执行 **auto tensor = predictor->GetOutputTensor(out_names[i])** ; 您可以获取第i个输出的tensor。通过 **tensor->copy_to_cpu(data)** 将tensor中的数据copy到data指针上 diff --git a/doc/fluid/Paddle-Inference/index.html b/doc/fluid/Paddle-Inference/index.html new file mode 100644 index 0000000000000000000000000000000000000000..974e1209f9475ef0c3930b4039f50735e6e01b6d --- /dev/null +++ b/doc/fluid/Paddle-Inference/index.html @@ -0,0 +1,84 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/python/ELMo/README.md b/doc/fluid/Paddle-Inference/python/ELMo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fc2bbc1ed0dbad27dc1bb3c76f63d92d8f86bee5 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/ELMo/README.md @@ -0,0 +1,45 @@ +## 基于ELMo的LAC分词预测样例 + +### 一:准备环境 + +请您在环境中安装1.7或以上版本的Paddle,具体的安装方式请参照[飞桨官方页面](https://www.paddlepaddle.org.cn/)的指示方式。 + +### 二:下载模型以及测试数据 + + +1) **获取预测模型** + +点击[链接](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/elmo/elmo.tgz)下载模型,如果你想获取更多的**模型训练信息**,请访问[链接](https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleNLP/pretrain_language_models/ELMo)。解压后存储到该工程的根目录。 + +2) **获取相关数据** + +点击[链接](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/elmo/elmo_data.tgz)下载相关数据,解压后存储到该工程的根目录。 + +### 三:运行预测 + +`reader.py` 包含了数据读取等功能。 +`infer.py` 包含了创建predictor,读取输入,预测,获取输出的等功能。 + +运行: +``` +python infer.py +``` + +分词结果为: + +``` +1 sample's result: /n 电脑/vn 对/v-I 胎儿/v-I 影响/vn-B 大/v-I 吗/a +2 sample's result: 这个/r 跟/p 我们/ns 一直/p 传承/n 《/p 易经/n 》/n 的/u 精神/n 是/v-I 分/v 不/d 开/v 的/u +3 sample's result: 他们/p 不/r 但/ad 上/v-I 名医/v-I 门诊/n ,/w 还/n 兼/ns-I 作/ns-I 门诊/n 医生/v-I 的/n 顾问/v-I 团/nt +4 sample's result: 负责/n 外商/v-I 投资/v-I 企业/n 和/v-I 外国/v-I 企业/n 的/u 税务/nr-I 登记/v-I ,/w 纳税/n 申报/vn 和/n 税收/vn 资料/n 的/u 管理/n ,/w 全面/c 掌握/n 税收/vn 信息/n +5 sample's result: 采用/ns-I 弹性/ns-I 密封/ns-I 结构/n ,/w 实现/n 零/v-B 间隙/v-I +6 sample's result: 要/r 做/n 好/p 这/n 三/p 件/vn 事/n ,/w 支行/q 从/q 风险/n 管理/p 到/a 市场/q 营销/n 策划/c 都/p 必须/vn 专业/n 到位/vn +7 sample's result: 那么/nz-B ,/r 请/v-I 你/v-I 一定/nz-B 要/d-I 幸福/ad ./v-I +8 sample's result: 叉车/ns-I 在/ns-I 企业/n 的/u 物流/n 系统/vn 中/ns-I 扮演/ns-I 着/v-I 非常/q 重要/n 的/u 角色/n ,/w 是/u 物料/vn 搬运/ns-I 设备/n 中/vn 的/u 主力/ns-I 军/v-I +9 sample's result: 我/r 真/t 的/u 能够/vn 有/ns-I 机会/ns-I 拍摄/v-I 这部/vn 电视/ns-I 剧/v-I 么/vn +10 sample's result: 这种/r 情况/n 应该/v-I 是/v-I 没/n 有/p 危害/n 的/u +``` + +### 相关链接 +- [Paddle Inference使用Quick Start!]() +- [Paddle Inference Python Api使用]() diff --git a/doc/fluid/Paddle-Inference/python/ELMo/index.html b/doc/fluid/Paddle-Inference/python/ELMo/index.html new file mode 100644 index 0000000000000000000000000000000000000000..d7b4b160290072afdbfaa8cc23dd39fecfd35d85 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/ELMo/index.html @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/python/ELMo/infer.py b/doc/fluid/Paddle-Inference/python/ELMo/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..4e7ec1bb637c9b78db6c6e34be927c4d29bf0a48 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/ELMo/infer.py @@ -0,0 +1,141 @@ +#coding: utf-8 +from __future__ import print_function +import numpy as np +import paddle +import argparse +import reader +import sys + +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import create_paddle_predictor + + +def parse_args(): + """ + Parsing the input parameters. + """ + parser = argparse.ArgumentParser("Inference for lexical analyzer.") + parser.add_argument( + "--model_dir", + type=str, + default="elmo", + help="The folder where the test data is located.") + parser.add_argument( + "--testdata_dir", + type=str, + default="elmo_data/dev", + help="The folder where the test data is located.") + parser.add_argument( + "--use_gpu", + type=int, + default=False, + help="Whether or not to use GPU. 0-->CPU 1-->GPU") + parser.add_argument( + "--word_dict_path", + type=str, + default="elmo_data/vocabulary_min5k.txt", + help="The path of the word dictionary.") + parser.add_argument( + "--label_dict_path", + type=str, + default="elmo_data/tag.dic", + help="The path of the label dictionary.") + parser.add_argument( + "--word_rep_dict_path", + type=str, + default="elmo_data/q2b.dic", + help="The path of the word replacement Dictionary.") + + args = parser.parse_args() + return args + + +def to_lodtensor(data): + """ + Convert data in list into lodtensor. + """ + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + return flattened_data, [lod] + + +def create_predictor(args): + if args.model_dir is not "": + config = AnalysisConfig(args.model_dir) + else: + config = AnalysisConfig(args.model_file, args.params_file) + + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + if args.use_gpu: + config.enable_use_gpu(1000, 0) + else: + # If not specific mkldnn, you can set the blas thread. + # The thread num should not be greater than the number of cores in the CPU. 
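+        # (4 is only an example value here; tune it to the number of physical
+        # cores available on the deployment machine.)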
+ config.set_cpu_math_library_num_threads(4) + + predictor = create_paddle_predictor(config) + return predictor + + +def run(predictor, datas, lods): + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(datas[i].shape) + input_tensor.copy_from_cpu(datas[i].copy()) + input_tensor.set_lod(lods[i]) + + # do the inference + predictor.zero_copy_run() + + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + return results + + +if __name__ == '__main__': + + args = parse_args() + word2id_dict = reader.load_reverse_dict(args.word_dict_path) + label2id_dict = reader.load_reverse_dict(args.label_dict_path) + word_rep_dict = reader.load_dict(args.word_rep_dict_path) + word_dict_len = max(map(int, word2id_dict.values())) + 1 + label_dict_len = max(map(int, label2id_dict.values())) + 1 + + pred = create_predictor(args) + + test_data = paddle.batch( + reader.file_reader(args.testdata_dir, word2id_dict, label2id_dict, + word_rep_dict), + batch_size=1) + batch_id = 0 + id2word = {v: k for k, v in word2id_dict.items()} + id2label = {v: k for k, v in label2id_dict.items()} + for data in test_data(): + batch_id += 1 + word_data, word_lod = to_lodtensor(list(map(lambda x: x[0], data))) + target_data, target_lod = to_lodtensor(list(map(lambda x: x[1], data))) + result_list = run(pred, [word_data, target_data], + [word_lod, target_lod]) + number_infer = np.array(result_list[0]) + number_label = np.array(result_list[1]) + number_correct = np.array(result_list[2]) + lac_result = "" + for i in range(len(data[0][0])): + lac_result += id2word[data[0][0][i]] + '/' + id2label[np.array( + result_list[3]).tolist()[i][0]] + " " + print("%d sample's result:" % batch_id, lac_result) + if batch_id >= 10: + exit() diff --git a/doc/fluid/Paddle-Inference/python/ELMo/reader.py b/doc/fluid/Paddle-Inference/python/ELMo/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..73d1130b426be0a63d1309ad10413cd3f0d0bc78 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/ELMo/reader.py @@ -0,0 +1,142 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#coding: utf-8 +""" +The file_reader converts raw corpus to input. 
+""" +import os +import __future__ +import io + + +def file_reader(file_dir, + word2id_dict, + label2id_dict, + word_replace_dict, + filename_feature=""): + """ + define the reader to read files in file_dir + """ + word_dict_len = max(map(int, word2id_dict.values())) + 1 + label_dict_len = max(map(int, label2id_dict.values())) + 1 + + def reader(): + """ + the data generator + """ + index = 0 + for root, dirs, files in os.walk(file_dir): + for filename in files: + for line in io.open( + os.path.join(root, filename), 'r', encoding='utf8'): + index += 1 + bad_line = False + line = line.strip("\n") + if len(line) == 0: + continue + seg_tag = line.rfind("\t") + word_part = line[0:seg_tag].strip().split(' ') + label_part = line[seg_tag + 1:] + word_idx = [] + words = word_part + for word in words: + if word in word_replace_dict: + word = word_replace_dict[word] + if word in word2id_dict: + word_idx.append(int(word2id_dict[word])) + else: + word_idx.append(int(word2id_dict[""])) + target_idx = [] + labels = label_part.strip().split(" ") + for label in labels: + if label in label2id_dict: + target_idx.append(int(label2id_dict[label])) + else: + target_idx.append(int(label2id_dict["O"])) + if len(word_idx) != len(target_idx): + print(line) + continue + yield word_idx, target_idx + + return reader + + +def test_reader(file_dir, + word2id_dict, + label2id_dict, + word_replace_dict, + filename_feature=""): + """ + define the reader to read test files in file_dir + """ + word_dict_len = max(map(int, word2id_dict.values())) + 1 + label_dict_len = max(map(int, label2id_dict.values())) + 1 + + def reader(): + """ + the data generator + """ + index = 0 + for root, dirs, files in os.walk(file_dir): + for filename in files: + if not filename.startswith(filename_feature): + continue + for line in io.open( + os.path.join(root, filename), 'r', encoding='utf8'): + index += 1 + bad_line = False + line = line.strip("\n") + if len(line) == 0: + continue + seg_tag = line.rfind("\t") + if seg_tag == -1: + seg_tag = len(line) + word_part = line[0:seg_tag] + label_part = line[seg_tag + 1:] + word_idx = [] + words = word_part + for word in words: + if ord(word) < 0x20: + word = ' ' + if word in word_replace_dict: + word = word_replace_dict[word] + if word in word2id_dict: + word_idx.append(int(word2id_dict[word])) + else: + word_idx.append(int(word2id_dict["OOV"])) + yield word_idx, words + + return reader + + +def load_reverse_dict(dict_path): + """ + Load a dict. The first column is the key and the second column is the value. + """ + result_dict = {} + for idx, line in enumerate(io.open(dict_path, "r", encoding='utf8')): + terms = line.strip("\n") + result_dict[terms] = idx + return result_dict + + +def load_dict(dict_path): + """ + Load a dict. The first column is the value and the second column is the key. 
+ """ + result_dict = {} + for idx, line in enumerate(io.open(dict_path, "r", encoding='utf8')): + terms = line.strip("\n") + result_dict[idx] = terms + return result_dict diff --git a/doc/fluid/Paddle-Inference/python/README.md b/doc/fluid/Paddle-Inference/python/README.md new file mode 100644 index 0000000000000000000000000000000000000000..127dc3d278601f5c428347ad813d2a663c79dd60 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/README.md @@ -0,0 +1,21 @@ +# Python 预测样例 + +**如果您看到这个目录,我们会假设您已经对Paddle Inference有了一定的了解。如果您刚刚接触Paddle Inference不久,建议您[访问这里](https://paddle-inference.readthedocs.io/en/latest/#)对Paddle Inference做一个初步的认识。** + +在这个目录下,我们为大家准备了图像中使用的分类,检测,以及NLP使用Ernie/Bert模型等Python测试样例。 + + +## 前提准备 + +为了能够顺利运行样例,请您在环境中安装**1.7**或以上版本的Paddle,具体的安装方式请参照[飞桨官方页面](https://www.paddlepaddle.org.cn/)的指示方式。 + + +**验证是否安装成功:** + +1)Shell下进入python解释器 + +2) 输入`import paddle.fluid`,再输入`paddle.fluid.install_check.run_check()` + +如果出现`Your Paddle Fluid is installed successfully!`, 说明您已经成功安装。 + +有了Paddle Python环境后,我们开始进入各个目录进行样例测试吧~ diff --git a/doc/fluid/Paddle-Inference/python/index.html b/doc/fluid/Paddle-Inference/python/index.html new file mode 100644 index 0000000000000000000000000000000000000000..c60eaab2183a34f3092c52c7c3de34a0eb811dc8 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/index.html @@ -0,0 +1,85 @@ + + + + + + + + + + + + + + + + + +
+
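The installation check described in `python/README.md` above can also be kept as a tiny script; this is just the two interpreter commands from that README, nothing more.

```
# Sanity check that Paddle (1.7+) is installed correctly.
import paddle.fluid

paddle.fluid.install_check.run_check()
# On success the check prints:
#   Your Paddle Fluid is installed successfully!
```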
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/README.md b/doc/fluid/Paddle-Inference/python/mask_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d8d2e24e1aaa06d9e308e75bb74d6c1de11e4486 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/README.md @@ -0,0 +1,43 @@ +## 口罩检测 + + +在整个口罩检测任务中,我们会用到两个模型,一个是人脸检测模型,用来检测出图片中的所有的人脸;另外一个为人脸口罩分类模型,用来对人脸进行分类,判别该人脸是否戴有口罩。 + +在本目录中,我们通过Paddle Inference Python 接口实现了口罩检测任务。 + +### 运行 + + +**1) 下载模型** + +我们有两种方式下载模型: + +a. 通过脚本下载 + +``` +cd models +sh model_downloads.sh +``` + +b. 通过PaddleHub下载 + +``` +# 下载paddlehub以后,通过python执行以下代码 +import paddlehub as hub +pyramidbox_lite_mobile_mask = hub.Module(name="pyramidbox_lite_mobile_mask") +# 将模型保存在models文件夹之中 +pyramidbox_lite_mobile_mask.processor.save_inference_model(dirname="models") +# 通过以上命令,可以获得人脸检测和口罩佩戴判断模型,分别存储在pyramidbox_lite和mask_detector之中。文件夹中的__model__是模型结构文件,__param__文件是权重文件。 +``` + + +**2) 运行程序** + +``` +python cam_video.py +``` + +运行后,程序会启动机器上的摄像头并执行口罩检测流程,如果检测到有人脸不带口罩,程序会对该人脸进行红框标记,并显示到屏幕。 + + +![图片1](https://user-images.githubusercontent.com/5595332/81150234-266f4b00-8fb2-11ea-98e7-92909d9c6792.png) diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/assets/VCR_OSD_MONO_1.001.ttf b/doc/fluid/Paddle-Inference/python/mask_detection/assets/VCR_OSD_MONO_1.001.ttf new file mode 100644 index 0000000000000000000000000000000000000000..dcca687a434d5c7b6a3027e65e0b7d8728b25c71 Binary files /dev/null and b/doc/fluid/Paddle-Inference/python/mask_detection/assets/VCR_OSD_MONO_1.001.ttf differ diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/assets/mask_test2.jpg b/doc/fluid/Paddle-Inference/python/mask_detection/assets/mask_test2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..69226d318c6f6033ed3e588bd02e8857bec91971 Binary files /dev/null and b/doc/fluid/Paddle-Inference/python/mask_detection/assets/mask_test2.jpg differ diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/assets/test_mask_detection.jpg b/doc/fluid/Paddle-Inference/python/mask_detection/assets/test_mask_detection.jpg new file mode 100644 index 0000000000000000000000000000000000000000..33c07b607d3dfcee0a27ab8f3c8a8fd9bf7dee80 Binary files /dev/null and b/doc/fluid/Paddle-Inference/python/mask_detection/assets/test_mask_detection.jpg differ diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/cam_video.py b/doc/fluid/Paddle-Inference/python/mask_detection/cam_video.py new file mode 100644 index 0000000000000000000000000000000000000000..701392987d44a919e9030a825b9915f81ced3a5c --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/cam_video.py @@ -0,0 +1,18 @@ +# -*- coding: UTF-8 -*- + +import cv2 +from mask_detect import MaskPred + +# The MaskPred class implements the function of face mask detection, +# including face detection and face mask classification +mp = MaskPred(True, True, 0) +# Turn on the first camera, 0 means device ID +cap = cv2.VideoCapture(0) +cv2.namedWindow('Mask Detect') + +while True: + ret, frame = cap.read() + if cv2.waitKey(10) == ord("q"): + break + result = mp.run(frame) + cv2.imshow("image", result['img']) diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/config.py b/doc/fluid/Paddle-Inference/python/mask_detection/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8fcaeef85fc1c07ba88429e63e21261b5b68fe6 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/config.py @@ -0,0 +1,31 @@ +# -*- 
coding: UTF-8 -*-
+import os
+# current dir
+PREDICT_FILE_PATH = os.path.split(os.path.realpath(__file__))[0]
+
+# face detection model file
+DETECT_MODEL_FILE = os.path.join(PREDICT_FILE_PATH,
+                                 "models/pyramidbox_lite/model")
+# face detection params file
+DETECT_MODEL_PARAM = os.path.join(PREDICT_FILE_PATH,
+                                  "models/pyramidbox_lite/params")
+# face mask classification model file
+MASK_MODEL_FILE = os.path.join(PREDICT_FILE_PATH, "models/mask_detector/model")
+# face mask classification params file
+MASK_MODEL_PARAM = os.path.join(PREDICT_FILE_PATH,
+                                "models/mask_detector/params")
+
+# Face detection threshold.
+# Each row of the face detector's output looks like
+# [_, confidence, x_min, y_min, x_max, y_max] (the first field is unused here,
+# coordinates are normalized). If confidence > FACE_THREAS, the corresponding
+# region is treated as a face.
+FACE_THREAS = 0.6
+
+# Face mask classification threshold.
+# If the classification score is greater than this threshold, the face is
+# considered to be wearing a mask.
+MASK_THREAS = 0.6
+
+# Before face detection inference, the input image is resized by the factor
+# DETECT_INPUT_SHRINK.
+DETECT_INPUT_SHRINK = 0.3
+
+FACE_BOX_LINE_WIDTH = 8
+TIME_TEXT_SIZE = 50
diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/index.html b/doc/fluid/Paddle-Inference/python/mask_detection/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..9855d1c118deeab3f03e2a02693fecbb1eec2299
--- /dev/null
+++ b/doc/fluid/Paddle-Inference/python/mask_detection/index.html
@@ -0,0 +1,107 @@
+
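As a complement to `cam_video.py`, the sketch below (not part of the patch) runs the same mask-detection pipeline once on the bundled test image instead of a camera stream, reusing the `MaskPred` class defined in `mask_detect.py` in the next diff and the result keys its `run()` method returns.

```
# Minimal sketch: single-image mask detection on the CPU.
import cv2
from mask_detect import MaskPred

# use_mkldnn=True, use_gpu=False, device_id=0
mp = MaskPred(True, False, 0)
img = cv2.imread("assets/test_mask_detection.jpg")   # sample image shipped in assets/

result = mp.run(img)
print("faces detected:", result["face_num"])
print("all faces wearing masks:", result["all_with_mask"])
cv2.imwrite("mask_result.jpg", result["img"])         # save the annotated frame
```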
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/mask_detect.py b/doc/fluid/Paddle-Inference/python/mask_detection/mask_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..75f1707dad916248c89ebead25d550bfc36ff816 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/mask_detect.py @@ -0,0 +1,121 @@ +import cv2, os, sys +import numpy as np +from models.pd_model import Model +from models.preprocess import face_detect_preprocess, mask_classify_preprocess +from PIL import Image +from PIL import ImageDraw, ImageFont +import datetime +from config import * + + +class MaskPred: + def __init__(self, use_mkldnn=True, use_gpu=False, device_id=0): + # face detector + self.face_detector = Model(DETECT_MODEL_FILE, DETECT_MODEL_PARAM, + use_mkldnn, use_gpu, device_id) + self.face_threas = FACE_THREAS + # face mask classify + self.mask_classify = Model(MASK_MODEL_FILE, MASK_MODEL_PARAM, + use_mkldnn, use_gpu, device_id) + self.mask_threas = MASK_THREAS + self.index = 0 + + def get_faces(self, data, h, w): + faces_loc = [] + for d in data: + if d[1] >= self.face_threas: + x_min = max(d[2] * w, 0) + y_min = max(d[3] * h, 0) + x_h = min((d[4] - d[2]) * w, w) + y_w = min((d[5] - d[3]) * h, h) + faces_loc.append([int(x_min), int(y_min), int(x_h), int(y_w)]) + return faces_loc + + def draw_boxes(self, img, boxes): + h, w, _ = img.shape + image = Image.fromarray(img) + draw = ImageDraw.Draw(image) + CUR_FILE_PATH = os.path.split(os.path.realpath(__file__))[0] + for box in boxes: + x_min = box[0] + y_min = box[1] + x_max = box[0] + box[2] + y_max = box[1] + box[3] + (left, right, top, bottom) = (x_min, x_max, y_min, y_max) + color = "red" + if box[4] < self.mask_threas: + color = "blue" + draw.line( + [(left - 10, top - 10), (left - 10, bottom + 10), + (right + 10, bottom + 10), (right + 10, top - 10), + (left - 10, top - 10)], + width=FACE_BOX_LINE_WIDTH, + fill=color) + conf_text = str(box[4]) + + draw.text( + [left, top - 50], + conf_text, + font=ImageFont.truetype( + os.path.join(CUR_FILE_PATH, + "assets/VCR_OSD_MONO_1.001.ttf"), + size=30), + fill="#ff0000") + cur = datetime.datetime.now() + cur = str(cur) + draw.text( + [10, 10], + cur, + font=ImageFont.truetype( + os.path.join(CUR_FILE_PATH, + "assets/VCR_OSD_MONO_1.001.ttf"), + size=TIME_TEXT_SIZE), + fill="#ff0000") + img = np.asarray(image) + return img + + # do face detect and mask classify + def run(self, img): + h, w, c = img.shape + img_t = face_detect_preprocess(img, DETECT_INPUT_SHRINK) + results = self.face_detector.run([img_t]) + faces = self.get_faces(results[0], h, w) + faces_mask_loc_conf = [] + all_with_mask = True + for loc in faces: + # (x_min, y_min), (x_max, y_min), (x_min, y_max), (x_max, y_max) + pts = np.array([ + loc[0], loc[1], loc[2] + loc[0], loc[1], loc[0], + loc[1] + loc[3], loc[2] + loc[0], loc[1] + loc[3] + ]).reshape(4, 2).astype(np.float32) + face_img_t, temp_face = mask_classify_preprocess(img, pts) + mask_results = self.mask_classify.run([face_img_t]) + mask_conf = mask_results[0] + temp_loc = loc + if (mask_conf[0][1] < self.mask_threas): + all_with_mask = False + temp_loc.append(mask_conf[0][1]) + faces_mask_loc_conf.append(temp_loc) + + result_dict = { + "all_with_mask": all_with_mask, + "loc_conf": faces_mask_loc_conf + } + result_dict['face_num'] = len(faces_mask_loc_conf) + img = self.draw_boxes(img, faces_mask_loc_conf) + result_dict['img'] = img + return result_dict + + +if __name__ == "__main__": + mp = MaskPred(True, True, 0) + img = 
cv2.imread("./assets/test_mask_detection.jpg") + result = mp.run(img) + print(result['loc_conf']) + + if not result["all_with_mask"]: + result_img = result['img'] + h, w, _ = result_img.shape + result_img = cv2.resize(result_img, (int(w * 0.6), int(h * 0.6))) + cv2.imshow("image", result_img) + cv2.waitKey(0) diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/models/__init__.py b/doc/fluid/Paddle-Inference/python/mask_detection/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/models/model_downloads.sh b/doc/fluid/Paddle-Inference/python/mask_detection/models/model_downloads.sh new file mode 100644 index 0000000000000000000000000000000000000000..20c2d295acab0f6afe691250d086c0577dba6e1e --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/models/model_downloads.sh @@ -0,0 +1,10 @@ +# download the face detect model +wget https://paddle-inference-dist.cdn.bcebos.com/inference_demo/pyramidbox_lite.tar.gz + +# download the mask classification model +wget https://paddle-inference-dist.cdn.bcebos.com/inference_demo/mask_detector.tar.gz + +# unzip +tar -zxvf pyramidbox_lite.tar.gz +tar -zxvf mask_detector.tar.gz + diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/models/pd_model.py b/doc/fluid/Paddle-Inference/python/mask_detection/models/pd_model.py new file mode 100644 index 0000000000000000000000000000000000000000..d14be1232e818455e223841f304cddc8849bfd36 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/models/pd_model.py @@ -0,0 +1,44 @@ +import numpy as np +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import create_paddle_predictor + + +class Model: + def __init__(self, + model_file, + params_file, + use_mkldnn=True, + use_gpu=False, + device_id=0): + config = AnalysisConfig(model_file, params_file) + config.switch_use_feed_fetch_ops(False) + config.switch_specify_input_names(True) + config.enable_memory_optim() + + if use_gpu: + print("ENABLE_GPU") + config.enable_use_gpu(100, device_id) + + if use_mkldnn: + config.enable_mkldnn() + self.predictor = create_paddle_predictor(config) + + def run(self, img_list): + + input_names = self.predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = self.predictor.get_input_tensor(input_names[i]) + input_tensor.reshape(img_list[i].shape) + input_tensor.copy_from_cpu(img_list[i].copy()) + + self.predictor.zero_copy_run() + + results = [] + output_names = self.predictor.get_output_names() + + for i, name in enumerate(output_names): + output_tensor = self.predictor.get_output_tensor(output_names[i]) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results diff --git a/doc/fluid/Paddle-Inference/python/mask_detection/models/preprocess.py b/doc/fluid/Paddle-Inference/python/mask_detection/models/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..eca33eda1f081d88685886975a29865aa4465d10 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/mask_detection/models/preprocess.py @@ -0,0 +1,90 @@ +import cv2 +import numpy as np +from PIL import Image +import math + +FACE_H = 128 +FACE_W = 128 + + +def face_detect_preprocess(img, shrink=1.0): + # BGR + img_shape = img.shape + img = cv2.resize( + img, (int(img_shape[1] * shrink), int(img_shape[0] * shrink)), + interpolation=cv2.INTER_CUBIC) + + # HWC -> CHW + img = np.swapaxes(img, 1, 2) + img = 
np.swapaxes(img, 1, 0) + + # RBG to BGR + mean = [104., 117., 123.] + scale = 0.007843 + img = img.astype('float32') + img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32') + img = img * scale + img = img[np.newaxis, :] + return img + + +index = 0 + + +def mask_classify_preprocess(img, pts): + # BGR + img_face, _ = crop(img, pts) + t_img_face = img_face.copy() + # global index + # index += 1 + # cv2.imwrite(str(index)+ ".jpg", img_face) + img_face = img_face / 256. + # HWC -> CHW + img_face = np.swapaxes(img_face, 1, 2) + img_face = np.swapaxes(img_face, 1, 0) + + # RBG to BGR + mean = [0.5, 0.5, 0.5] + img_face = img_face.astype('float32') + img_face -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32') + img_face = img_face.reshape(-1, 3, FACE_H, FACE_W) + return img_face, t_img_face + + +#def crop(image, pts, shift=0, scale=1.38, rotate=0, res_width=128, res_height=128): +def crop(image, + pts, + shift=0, + scale=1.5, + rotate=0, + res_width=FACE_W, + res_height=FACE_H): + res = (res_width, res_height) + idx1 = 0 + idx2 = 1 + # angle + alpha = 0 + if pts[idx2, 0] != -1 and pts[idx2, 1] != -1 and pts[ + idx1, 0] != -1 and pts[idx1, 1] != -1: + alpha = math.atan2(pts[idx2, 1] - pts[idx1, 1], + pts[idx2, 0] - pts[idx1, 0]) * 180 / math.pi + pts[pts == -1] = np.inf + coord_min = np.min(pts, 0) + pts[pts == np.inf] = -1 + coord_max = np.max(pts, 0) + # coordinates of center point + c = np.array([ + coord_max[0] - (coord_max[0] - coord_min[0]) / 2, + coord_max[1] - (coord_max[1] - coord_min[1]) / 2 + ]) # center + max_wh = max((coord_max[0] - coord_min[0]) / 2, + (coord_max[1] - coord_min[1]) / 2) + # Shift the center point, rot add eyes angle + c = c + shift * max_wh + rotate = rotate + alpha + M = cv2.getRotationMatrix2D((c[0], c[1]), rotate, + res[0] / (2 * max_wh * scale)) + M[0, 2] = M[0, 2] - (c[0] - res[0] / 2.0) + M[1, 2] = M[1, 2] - (c[1] - res[0] / 2.0) + image_out = cv2.warpAffine(image, M, res) + return image_out, M diff --git a/doc/fluid/Paddle-Inference/python/paddle_trt/infer_trt_ernie.py b/doc/fluid/Paddle-Inference/python/paddle_trt/infer_trt_ernie.py new file mode 100644 index 0000000000000000000000000000000000000000..84760dec6bd7b2f1b45f500d123ed775bb5ad683 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/paddle_trt/infer_trt_ernie.py @@ -0,0 +1,109 @@ +import numpy as np +import argparse +import cv2 + +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import create_paddle_predictor + + +# this is a simple resnet block for dynamci test. +def create_predictor(args): + config = AnalysisConfig('./model') + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + config.enable_use_gpu(100, 0) + + # using dynamic shpae mode, the max_batch_size will be ignored. 
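+    # Note on the dynamic-shape setup below: the three dicts passed to
+    # config.set_trt_dynamic_shape_info(...) give, for each named input,
+    # its minimum, maximum and optimal shape. TensorRT builds the engine
+    # for that whole shape range, which is why the fixed max_batch_size
+    # argument of enable_tensorrt_engine() is not used in this mode.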
+ config.enable_tensorrt_engine( + workspace_size=1 << 30, + max_batch_size=1, + min_subgraph_size=5, + precision_mode=AnalysisConfig.Precision.Float32, + use_static=False, + use_calib_mode=False) + + head_number = 12 + + names = [ + "placeholder_0", "placeholder_1", "placeholder_2", "stack_0.tmp_0" + ] + min_input_shape = [1, 1, 1] + max_input_shape = [100, 128, 1] + opt_input_shape = [10, 60, 1] + + config.set_trt_dynamic_shape_info({ + names[0]: min_input_shape, + names[1]: min_input_shape, + names[2]: min_input_shape, + names[3]: [1, head_number, 1, 1] + }, { + names[0]: max_input_shape, + names[1]: max_input_shape, + names[2]: max_input_shape, + names[3]: [100, head_number, 128, 128] + }, { + names[0]: opt_input_shape, + names[1]: opt_input_shape, + names[2]: opt_input_shape, + names[3]: [10, head_number, 60, 60] + }) + predictor = create_paddle_predictor(config) + return predictor + + +def run(predictor, data): + # copy data to input tensor + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(data[i].shape) + input_tensor.copy_from_cpu(data[i].copy()) + + # do the inference + predictor.zero_copy_run() + + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_file", + type=str, + default="", + help="Model filename, Specify this when your model is a combined model." + ) + parser.add_argument( + "--params_file", + type=str, + default="", + help="Parameter filename, Specify this when your model is a combined model." + ) + parser.add_argument( + "--model_dir", + type=str, + default="", + help="Model dir, If you load a non-combined model, specify the directory of the model." + ) + parser.add_argument( + "--use_gpu", type=int, default=0, help="Whether use gpu.") + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + pred = create_predictor(args) + in1 = np.ones((1, 128, 1)).astype(np.int64) + in2 = np.ones((1, 128, 1)).astype(np.int64) + in3 = np.ones((1, 128, 1)).astype(np.int64) + in4 = np.ones((1, 128, 1)).astype(np.float32) + result = run(pred, [in1, in2, in3, in4]) + print(result) diff --git a/doc/fluid/Paddle-Inference/python/resnet50/README.md b/doc/fluid/Paddle-Inference/python/resnet50/README.md new file mode 100644 index 0000000000000000000000000000000000000000..25b83fa1298ec027b62f72aa95ba40e973311503 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/resnet50/README.md @@ -0,0 +1,44 @@ +## 运行Resnet50图像分类样例 + + +### 一:准备环境 + +请您在环境中安装1.7或以上版本的Paddle,具体的安装方式请参照[飞桨官方页面](https://www.paddlepaddle.org.cn/)的指示方式。 + + +### 二:下载模型以及测试数据 + + +1)**获取预测模型** + +下载[模型](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ResNet50.tar.gz),模型为imagenet 数据集训练得到的,如果你想获取更多的模型训练信息,请访问[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification)。 + + +2)**获取预测样例图片** + +下载[样例图片](https://paddle-inference-dist.bj.bcebos.com/inference_demo/python/resnet50/ILSVRC2012_val_00000247.jpeg)。 + +图片如下: +

+ (sample image: ILSVRC2012_val_00000247.jpeg)

+ + +### 三:运行预测 + +文件`img_preprocess.py`包含了图像的预处理。 +文件`infer_resnet.py` 包含了创建predictor,读取示例图片,预测,获取输出的等功能。 + +运行: +``` +python infer_resnet.py --model_file=./ResNet50/model --params_file=./ResNet50/params --use_gpu=1 +``` + +运行的结果为: ('class index: ', 13)。 +13表示图片的类别。我们通过imagenet [类别映射表](https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a), 可以找到对应的类别,即junco, snowbird,由此说明我们的分类器分类正确。 + +### 相关链接 +- [Paddle Inference使用Quick Start!]() +- [Paddle Inference Python Api使用]() diff --git a/doc/fluid/Paddle-Inference/python/resnet50/img_preprocess.py b/doc/fluid/Paddle-Inference/python/resnet50/img_preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..34321de022caac691a7be414ead89b05bff7ffa3 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/resnet50/img_preprocess.py @@ -0,0 +1,41 @@ +import cv2 +import numpy as np + + +def resize_short(img, target_size): + """ resize_short """ + percent = float(target_size) / min(img.shape[0], img.shape[1]) + resized_width = int(round(img.shape[1] * percent)) + resized_height = int(round(img.shape[0] * percent)) + resized = cv2.resize(img, (resized_width, resized_height)) + return resized + + +def crop_image(img, target_size, center): + """ crop_image """ + height, width = img.shape[:2] + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img[int(h_start):int(h_end), int(w_start):int(w_end), :] + return img + + +def preprocess(img): + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + img = resize_short(img, 224) + img = crop_image(img, 224, True) + # bgr-> rgb && hwc->chw + img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 + img_mean = np.array(mean).reshape((3, 1, 1)) + img_std = np.array(std).reshape((3, 1, 1)) + img -= img_mean + img /= img_std + return img[np.newaxis, :] diff --git a/doc/fluid/Paddle-Inference/python/resnet50/index.html b/doc/fluid/Paddle-Inference/python/resnet50/index.html new file mode 100644 index 0000000000000000000000000000000000000000..009f4a68d7aebbd52e144f24d8793801d265f38c --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/resnet50/index.html @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + +

+
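Before the `infer_resnet.py` diff below, here is a short sketch (not part of the patch) of how its single `class index` printout can be extended to a top-5 listing. The `top_k` helper and the fake score vector are illustrative assumptions; with the real model you would pass `result[0][0]`, the vector the script feeds to `np.argmax`.

```
# Minimal sketch: top-5 classes from the model's score vector.
import numpy as np

def top_k(scores, k=5):
    """Return (class_index, score) pairs for the k highest-scoring classes."""
    order = np.argsort(scores)[::-1][:k]
    return [(int(i), float(scores[i])) for i in order]

# Illustration with a fake 1000-way score vector; use result[0][0] in practice.
fake_scores = np.random.rand(1000).astype("float32")
for idx, score in top_k(fake_scores):
    print("class index: {}, score: {:.4f}".format(idx, score))
```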
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/python/resnet50/infer_resnet.py b/doc/fluid/Paddle-Inference/python/resnet50/infer_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..ab6578bee0c57402838cda54870a02aba1b4b861 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/resnet50/infer_resnet.py @@ -0,0 +1,85 @@ +import numpy as np +import argparse +import cv2 + +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import create_paddle_predictor + +from img_preprocess import preprocess + + +def create_predictor(args): + if args.model_dir is not "": + config = AnalysisConfig(args.model_dir) + else: + config = AnalysisConfig(args.model_file, args.params_file) + + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + if args.use_gpu: + config.enable_use_gpu(1000, 0) + else: + # If not specific mkldnn, you can set the blas thread. + # The thread num should not be greater than the number of cores in the CPU. + config.set_cpu_math_library_num_threads(4) + #config.enable_mkldnn() + + predictor = create_paddle_predictor(config) + return predictor + + +def run(predictor, img): + # copy img data to input tensor + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(img[i].shape) + input_tensor.copy_from_cpu(img[i].copy()) + + # do the inference + predictor.zero_copy_run() + + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + + return results + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_file", + type=str, + default="", + help="Model filename, Specify this when your model is a combined model." + ) + parser.add_argument( + "--params_file", + type=str, + default="", + help="Parameter filename, Specify this when your model is a combined model." + ) + parser.add_argument( + "--model_dir", + type=str, + default="", + help="Model dir, If you load a non-combined model, specify the directory of the model." + ) + parser.add_argument( + "--use_gpu", type=int, default=0, help="Whether use gpu.") + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + pred = create_predictor(args) + img = cv2.imread('./ILSVRC2012_val_00000247.jpeg') + img = preprocess(img) + #img = np.ones((1, 3, 224, 224)).astype(np.float32) + result = run(pred, [img]) + print("class index: ", np.argmax(result[0][0])) diff --git a/doc/fluid/Paddle-Inference/python/yolov3/README.md b/doc/fluid/Paddle-Inference/python/yolov3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8266c9481d1188c554438ef9e93068042f5200ef --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/yolov3/README.md @@ -0,0 +1,69 @@ +## 运行YOLOv3图像检测样例 + + +### 一:准备环境 + +请您在环境中安装1.7或以上版本的Paddle,具体的安装方式请参照[飞桨官方页面](https://www.paddlepaddle.org.cn/)的指示方式。 + + +### 二:下载模型以及测试数据 + + +1)**获取预测模型** + +点击[链接](https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz)下载模型, 该模型在imagenet数据集训练得到的,如果你想获取更多的**模型训练信息**,请访问[这里](https://github.com/PaddlePaddle/PaddleDetection)。 + + +2)**获取预测样例图片** + +下载[样例图片](https://paddle-inference-dist.bj.bcebos.com/inference_demo/images/kite.jpg)。 + +图片如下: +

+ (sample image: kite.jpg)

+ + +### 三:运行预测 + +文件`utils.py`包含了图像的预处理等帮助函数。 +文件`infer_yolov3.py` 包含了创建predictor,读取示例图片,预测,获取输出的等功能。 + +运行: +``` +python infer_yolov3.py --model_file=./yolov3_infer/__model__ --params_file=./yolov3_infer/__params__ --use_gpu=1 +``` + +输出结果如下所示: + +``` +category id is 0.0, bbox is [ 98.47467 471.34283 120.73273 578.5184 ] +category id is 0.0, bbox is [ 51.752716 415.51324 73.18762 515.24005 ] +category id is 0.0, bbox is [ 37.176304 343.378 46.64221 380.92963 ] +category id is 0.0, bbox is [155.78638 328.0806 159.5393 339.37192] +category id is 0.0, bbox is [233.86328 339.96912 239.35403 355.3322 ] +category id is 0.0, bbox is [ 16.212902 344.42365 25.193722 377.97137 ] +category id is 0.0, bbox is [ 10.583471 356.67862 14.9261 372.8137 ] +category id is 0.0, bbox is [ 79.76479 364.19492 86.07656 385.64255] +category id is 0.0, bbox is [312.8938 311.9908 314.58527 316.60056] +category id is 33.0, bbox is [266.97925 51.70044 299.45105 99.996414] +category id is 33.0, bbox is [210.45593 229.92128 217.77551 240.97136] +category id is 33.0, bbox is [125.36278 159.80171 135.49306 189.8976 ] +category id is 33.0, bbox is [486.9354 266.164 494.4437 283.84637] +category id is 33.0, bbox is [259.01584 232.23044 270.69266 248.58704] +category id is 33.0, bbox is [135.60567 254.57668 144.96178 276.9275 ] +category id is 33.0, bbox is [341.91315 255.44394 345.0335 262.3398 ] +``` + +

+ (detection result image)

+ + +### 相关链接 +- [Paddle Inference使用Quick Start!]() +- [Paddle Inference Python Api使用]() diff --git a/doc/fluid/Paddle-Inference/python/yolov3/index.html b/doc/fluid/Paddle-Inference/python/yolov3/index.html new file mode 100644 index 0000000000000000000000000000000000000000..c272e07eddd69249a1d1612bf5aaa4e2264b3fb3 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/yolov3/index.html @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + +

+
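Before the `infer_yolov3.py` diff below, here is a short sketch (not part of the patch) of post-processing the detections listed above. Each output row is unpacked by `utils.draw_bbox` as `[category_id, score, xmin, ymin, xmax, ymax]`; the `count_detections` helper and the fake rows are illustrative assumptions.

```
# Minimal sketch: count detections per category after score thresholding,
# using the same row layout as utils.draw_bbox.
from collections import Counter

import numpy as np

def count_detections(rows, threshold=0.5):
    """Map category id -> number of boxes whose score passes the threshold."""
    kept = [row for row in rows if row[1] >= threshold]
    return Counter(int(row[0]) for row in kept)

# Illustration with two fake rows; use result[0] from infer_yolov3.py in practice.
fake_rows = np.array([
    [0.0, 0.91, 98.0, 471.0, 121.0, 579.0],   # kept by the 0.5 threshold
    [33.0, 0.42, 267.0, 52.0, 299.0, 100.0],  # dropped by the 0.5 threshold
], dtype="float32")
print(count_detections(fake_rows))   # Counter({0: 1})
```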
+ + + + + + + diff --git a/doc/fluid/Paddle-Inference/python/yolov3/infer_yolov3.py b/doc/fluid/Paddle-Inference/python/yolov3/infer_yolov3.py new file mode 100644 index 0000000000000000000000000000000000000000..8090e471a73462d06e4179150e043e057f115362 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/yolov3/infer_yolov3.py @@ -0,0 +1,89 @@ +import numpy as np +import argparse +import cv2 +from PIL import Image + +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import create_paddle_predictor + +from utils import preprocess, draw_bbox + + +def create_predictor(args): + if args.model_dir is not "": + config = AnalysisConfig(args.model_dir) + else: + config = AnalysisConfig(args.model_file, args.params_file) + + config.switch_use_feed_fetch_ops(False) + config.enable_memory_optim() + if args.use_gpu: + config.enable_use_gpu(1000, 0) + else: + # If not specific mkldnn, you can set the blas thread. + # The thread num should not be greater than the number of cores in the CPU. + config.set_cpu_math_library_num_threads(4) + #config.enable_mkldnn() + + predictor = create_paddle_predictor(config) + return predictor + + +def run(predictor, img): + # copy img data to input tensor + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_tensor(name) + input_tensor.reshape(img[i].shape) + input_tensor.copy_from_cpu(img[i].copy()) + + # do the inference + predictor.zero_copy_run() + + results = [] + # get out data from output tensor + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_tensor(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + return results + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_file", + type=str, + default="", + help="Model filename, Specify this when your model is a combined model." + ) + parser.add_argument( + "--params_file", + type=str, + default="", + help="Parameter filename, Specify this when your model is a combined model." + ) + parser.add_argument( + "--model_dir", + type=str, + default="", + help="Model dir, If you load a non-combined model, specify the directory of the model." 
+ ) + parser.add_argument( + "--use_gpu", type=int, default=0, help="Whether use gpu.") + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + img_name = 'kite.jpg' + save_img_name = 'res.jpg' + im_size = 608 + pred = create_predictor(args) + img = cv2.imread(img_name) + data = preprocess(img, im_size) + im_shape = np.array([im_size, im_size]).reshape((1, 2)).astype(np.int32) + result = run(pred, [data, im_shape]) + img = Image.open(img_name).convert('RGB').resize((im_size, im_size)) + draw_bbox(img, result[0], save_name=save_img_name) diff --git a/doc/fluid/Paddle-Inference/python/yolov3/utils.py b/doc/fluid/Paddle-Inference/python/yolov3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5a60266e56fac17deb26bda3609882d900a17e70 --- /dev/null +++ b/doc/fluid/Paddle-Inference/python/yolov3/utils.py @@ -0,0 +1,52 @@ +import cv2 +import numpy as np +from PIL import Image, ImageDraw + + +def resize(img, target_size): + """resize to target size""" + if not isinstance(img, np.ndarray): + raise TypeError('image type is not numpy.') + im_shape = img.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale_x = float(target_size) / float(im_shape[1]) + im_scale_y = float(target_size) / float(im_shape[0]) + img = cv2.resize(img, None, None, fx=im_scale_x, fy=im_scale_y) + return img + + +def normalize(img, mean, std): + img = img / 255.0 + mean = np.array(mean)[np.newaxis, np.newaxis, :] + std = np.array(std)[np.newaxis, np.newaxis, :] + img -= mean + img /= std + return img + + +def preprocess(img, img_size): + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + img = resize(img, img_size) + img = img[:, :, ::-1].astype('float32') # bgr -> rgb + img = normalize(img, mean, std) + img = img.transpose((2, 0, 1)) # hwc -> chw + return img[np.newaxis, :] + + +def draw_bbox(img, result, threshold=0.5, save_name='res.jpg'): + """draw bbox""" + draw = ImageDraw.Draw(img) + for res in result: + cat_id, score, bbox = res[0], res[1], res[2:] + if score < threshold: + continue + xmin, ymin, xmax, ymax = bbox + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill=(255, 0, 0)) + print('category id is {}, bbox is {}'.format(cat_id, bbox)) + img.save(save_name, quality=95)